From 3055a6cfa04ba4288589778925e8838261e56078 Mon Sep 17 00:00:00 2001
From: Eugen Hristev <eugen.hristev@microchip.com>
Date: Tue, 22 May 2018 10:52:32 +0300
Subject: iio: Add channel for Position Relative

Add new channel type for relative position on a pad.

This type of analog sensor offers the position of a pen
on a touchpad, and is represented as a voltage, which can be
converted to a position on X and Y axis on the pad.
The channel will report the relative position on the pad in both directions.

The channel can then be consumed by a touchscreen driver or
read as-is for a raw indication of the touchpen on a touchpad.

Signed-off-by: Eugen Hristev <eugen.hristev@microchip.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/uapi/linux/iio/types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index 4213cdf88e3c..033c7d28924e 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -44,6 +44,7 @@ enum iio_chan_type {
 	IIO_COUNT,
 	IIO_INDEX,
 	IIO_GRAVITY,
+	IIO_POSITIONRELATIVE,
 };
 
 enum iio_modifier {
-- 
cgit v1.2.3


From 2e076f199097d670ce5e5492cea57f552b93bba9 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 28 May 2018 15:47:40 +0200
Subject: nl80211: add scan features for improved scan privacy

Add the scan flags for randomized SN and minimized probe request
content for improved scan privacy.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/uapi/linux/nl80211.h | 15 +++++++++++++++
 net/wireless/nl80211.c       |  8 +++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 28b36545de24..49f718e821a3 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5133,6 +5133,11 @@ enum nl80211_feature_flags {
  *	support to nl80211.
  * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate
  *      TXQs.
+ * @NL80211_EXT_FEATURE_SCAN_RANDOM_SN: Driver/device supports randomizing the
+ *	SN in probe request frames if requested by %NL80211_SCAN_FLAG_RANDOM_SN.
+ * @NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT: Driver/device can omit all data
+ *	except for supported rates from the probe request content if requested
+ *	by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -5167,6 +5172,8 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211,
 	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT,
 	NL80211_EXT_FEATURE_TXQS,
+	NL80211_EXT_FEATURE_SCAN_RANDOM_SN,
+	NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
@@ -5272,6 +5279,12 @@ enum nl80211_timeout_reason {
  *	possible scan results. This flag hints the driver to use the best
  *	possible scan configuration to improve the accuracy in scanning.
  *	Latency and power use may get impacted with this flag.
+ * @NL80211_SCAN_FLAG_RANDOM_SN: randomize the sequence number in probe
+ *	request frames from this scan to avoid correlation/tracking being
+ *	possible.
+ * @NL80211_SCAN_FLAG_MIN_PREQ_CONTENT: minimize probe request content to
+ *	only have supported rates and no additional capabilities (unless
+ *	added by userspace explicitly.)
  */
 enum nl80211_scan_flags {
 	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
@@ -5285,6 +5298,8 @@ enum nl80211_scan_flags {
 	NL80211_SCAN_FLAG_LOW_SPAN				= 1<<8,
 	NL80211_SCAN_FLAG_LOW_POWER				= 1<<9,
 	NL80211_SCAN_FLAG_HIGH_ACCURACY				= 1<<10,
+	NL80211_SCAN_FLAG_RANDOM_SN				= 1<<11,
+	NL80211_SCAN_FLAG_MIN_PREQ_CONTENT			= 1<<12,
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6c3ded1223fb..d2677259e13e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6925,7 +6925,13 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
 				     NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) ||
 	    !nl80211_check_scan_feat(wiphy, *flags,
 				     NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE,
-				     NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE))
+				     NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) ||
+	    !nl80211_check_scan_feat(wiphy, *flags,
+				     NL80211_SCAN_FLAG_RANDOM_SN,
+				     NL80211_EXT_FEATURE_SCAN_RANDOM_SN) ||
+	    !nl80211_check_scan_feat(wiphy, *flags,
+				     NL80211_SCAN_FLAG_MIN_PREQ_CONTENT,
+				     NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT))
 		return -EOPNOTSUPP;
 
 	if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
-- 
cgit v1.2.3


From c4cbaf7973a794839af080f13748335976cf3f3f Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Sat, 9 Jun 2018 09:14:42 +0300
Subject: cfg80211: Add support for HE

Add support for HE in cfg80211 and also add userspace API to
nl80211 to send rate information out, conforming with P802.11ax_D2.0.

Signed-off-by: Liad Kaufman <liad.kaufman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Ido Yariv <idox.yariv@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/linux/ieee80211.h    | 427 +++++++++++++++++++++++++++++++++++++++++++
 include/net/cfg80211.h       | 106 ++++++++++-
 include/uapi/linux/nl80211.h |  87 ++++++++-
 net/wireless/core.c          |  21 ++-
 net/wireless/nl80211.c       |  99 +++++++++-
 net/wireless/util.c          |  82 +++++++++
 6 files changed, 817 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 8fe7e4306816..e6a6503bfa33 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1539,6 +1539,106 @@ struct ieee80211_vht_operation {
 	__le16 basic_mcs_set;
 } __packed;
 
+/**
+ * struct ieee80211_he_cap_elem - HE capabilities element
+ *
+ * This structure is the "HE capabilities element" fixed fields as
+ * described in P802.11ax_D2.0 section 9.4.2.237.2 and 9.4.2.237.3
+ */
+struct ieee80211_he_cap_elem {
+	u8 mac_cap_info[5];
+	u8 phy_cap_info[9];
+} __packed;
+
+#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN	5
+
+/**
+ * enum ieee80211_he_mcs_support - HE MCS support definitions
+ * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
+ *	number of streams
+ * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported
+ * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported
+ * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported
+ *
+ * These definitions are used in each 2-bit subfield of the rx_mcs_*
+ * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are
+ * both split into 8 subfields by number of streams. These values indicate
+ * which MCSes are supported for the number of streams the value appears
+ * for.
+ */
+enum ieee80211_he_mcs_support {
+	IEEE80211_HE_MCS_SUPPORT_0_7	= 0,
+	IEEE80211_HE_MCS_SUPPORT_0_9	= 1,
+	IEEE80211_HE_MCS_SUPPORT_0_11	= 2,
+	IEEE80211_HE_MCS_NOT_SUPPORTED	= 3,
+};
+
+/**
+ * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field
+ *
+ * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field
+ * described in P802.11ax_D2.0 section 9.4.2.237.4
+ *
+ * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     widths less than or equal to 80MHz.
+ * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     widths less than or equal to 80MHz.
+ * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     width 160MHz.
+ * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     width 160MHz.
+ * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for
+ *     channel width 80p80MHz.
+ * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for
+ *     channel width 80p80MHz.
+ */
+struct ieee80211_he_mcs_nss_supp {
+	__le16 rx_mcs_80;
+	__le16 tx_mcs_80;
+	__le16 rx_mcs_160;
+	__le16 tx_mcs_160;
+	__le16 rx_mcs_80p80;
+	__le16 tx_mcs_80p80;
+} __packed;
+
+/**
+ * struct ieee80211_he_operation - HE operation element
+ *
+ * This structure is the "HE operation element" fields as
+ * described in P802.11ax_D2.0 section 9.4.2.238
+ */
+struct ieee80211_he_operation {
+	__le32 he_oper_params;
+	__le16 he_mcs_nss_set;
+	/* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */
+	u8 optional[0];
+} __packed;
+
+/**
+ * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
+ *
+ * This structure is the "MU AC Parameter Record" fields as
+ * described in P802.11ax_D2.0 section 9.4.2.240
+ */
+struct ieee80211_he_mu_edca_param_ac_rec {
+	u8 aifsn;
+	u8 ecw_min_max;
+	u8 mu_edca_timer;
+} __packed;
+
+/**
+ * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
+ *
+ * This structure is the "MU EDCA Parameter Set element" fields as
+ * described in P802.11ax_D2.0 section 9.4.2.240
+ */
+struct ieee80211_mu_edca_param_set {
+	u8 mu_qos_info;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_be;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_bk;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_vi;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_vo;
+} __packed;
 
 /* 802.11ac VHT Capabilities */
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
@@ -1577,6 +1677,328 @@ struct ieee80211_vht_operation {
 #define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN			0x10000000
 #define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN			0x20000000
 
+/* 802.11ax HE MAC capabilities */
+#define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
+#define IEEE80211_HE_MAC_CAP0_TWT_REQ				0x02
+#define IEEE80211_HE_MAC_CAP0_TWT_RES				0x04
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP		0x00
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1		0x08
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2		0x10
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3		0x18
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK			0x18
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1		0x00
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2		0x20
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4		0x40
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8		0x60
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16		0x80
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32		0xa0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64		0xc0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED	0xe0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK		0xe0
+
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED		0x00
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128			0x01
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256			0x02
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512			0x03
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK		0x03
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US		0x00
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US		0x04
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US		0x08
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK		0x0c
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_1		0x00
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_2		0x10
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_3		0x20
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_4		0x30
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_5		0x40
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_6		0x50
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_7		0x60
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8		0x70
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_MASK		0x70
+
+/* Link adaptation is split between byte HE_MAC_CAP1 and
+ * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE
+ * is set, in which case the following values apply:
+ * 0 = No feedback.
+ * 1 = reserved.
+ * 2 = Unsolicited feedback.
+ * 3 = both
+ */
+#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION			0x80
+
+#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION			0x01
+#define IEEE80211_HE_MAC_CAP2_ALL_ACK				0x02
+#define IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED			0x04
+#define IEEE80211_HE_MAC_CAP2_BSR				0x08
+#define IEEE80211_HE_MAC_CAP2_BCAST_TWT				0x10
+#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP			0x20
+#define IEEE80211_HE_MAC_CAP2_MU_CASCADING			0x40
+#define IEEE80211_HE_MAC_CAP2_ACK_EN				0x80
+
+#define IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU	0x01
+#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL			0x02
+#define IEEE80211_HE_MAC_CAP3_OFDMA_RA				0x04
+
+/* The maximum length of an A-MPDU is defined by the combination of the Maximum
+ * A-MPDU Length Exponent field in the HT capabilities, VHT capabilities and the
+ * same field in the HE capabilities.
+ */
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT	0x00
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1		0x08
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2		0x10
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED	0x18
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK		0x18
+#define IEEE80211_HE_MAC_CAP3_A_AMSDU_FRAG			0x20
+#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED			0x40
+#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS		0x80
+
+#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
+#define IEEE80211_HE_MAC_CAP4_QTP				0x02
+#define IEEE80211_HE_MAC_CAP4_BQR				0x04
+#define IEEE80211_HE_MAC_CAP4_SR_RESP				0x08
+#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
+#define IEEE80211_HE_MAC_CAP4_OPS				0x20
+#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU			0x40
+
+/* 802.11ax HE PHY capabilities */
+#define IEEE80211_HE_PHY_CAP0_DUAL_BAND					0x01
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G		0x02
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G	0x04
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G		0x08
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G	0x10
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G	0x20
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G	0x40
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK			0xfe
+
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ	0x01
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ	0x02
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ	0x04
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ	0x08
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK			0x0f
+#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A				0x10
+#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD			0x20
+#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US		0x40
+/* Midamble RX Max NSTS is split between byte #2 and byte #3 */
+#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS			0x80
+
+#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS			0x01
+#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US			0x02
+#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ			0x04
+#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ			0x08
+#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX				0x10
+#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX				0x20
+
+/* Note that the meaning of UL MU below is different between an AP and a non-AP
+ * sta, where in the AP case it indicates support for Rx and in the non-AP sta
+ * case it indicates support for Tx.
+ */
+#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO			0x40
+#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO			0x80
+
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM			0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK			0x01
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK			0x02
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM			0x03
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK			0x03
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1				0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2				0x04
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM			0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK			0x08
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK			0x10
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM			0x18
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK			0x18
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1				0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2				0x20
+#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA		0x40
+#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER				0x80
+
+#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE				0x01
+#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER				0x02
+
+/* Minimal allowed value of Max STS under 80MHz is 3 */
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4		0x0c
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5		0x10
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6		0x14
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7		0x18
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8		0x1c
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK	0x1c
+
+/* Minimal allowed value of Max STS above 80MHz is 3 */
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4		0x60
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5		0x80
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6		0xa0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7		0xc0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8		0xe0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK	0xe0
+
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1	0x00
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2	0x01
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3	0x02
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4	0x03
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5	0x04
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6	0x05
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7	0x06
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8	0x07
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK	0x07
+
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1	0x00
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2	0x08
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3	0x10
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4	0x18
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5	0x20
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6	0x28
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7	0x30
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8	0x38
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK	0x38
+
+#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK				0x40
+#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK				0x80
+
+#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU			0x01
+#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU			0x02
+#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB			0x04
+#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB			0x08
+#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB				0x10
+#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE			0x20
+#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO		0x40
+#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT			0x80
+
+#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR				0x01
+#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR			0x02
+#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI		0x04
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_1					0x08
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_2					0x10
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_3					0x18
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_4					0x20
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_5					0x28
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_6					0x30
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_7					0x38
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK				0x38
+#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ			0x40
+#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ			0x80
+
+#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI		0x01
+#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G		0x02
+#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU			0x04
+#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU			0x08
+#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI		0x10
+#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_2X_AND_1XLTF			0x20
+
+/* 802.11ax HE TX/RX MCS NSS Support  */
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS			(3)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS			(6)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS			(11)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK			0x07c0
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK			0xf800
+
+/* TX/RX HE MCS Support field Highest MCS subfield encoding */
+enum ieee80211_he_highest_mcs_supported_subfield_enc {
+	HIGHEST_MCS_SUPPORTED_MCS7 = 0,
+	HIGHEST_MCS_SUPPORTED_MCS8,
+	HIGHEST_MCS_SUPPORTED_MCS9,
+	HIGHEST_MCS_SUPPORTED_MCS10,
+	HIGHEST_MCS_SUPPORTED_MCS11,
+};
+
+/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */
+static inline u8
+ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap)
+{
+	u8 count = 4;
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
+		count += 4;
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
+		count += 4;
+
+	return count;
+}
+
+/* 802.11ax HE PPE Thresholds */
+#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS			(1)
+#define IEEE80211_PPE_THRES_NSS_POS				(0)
+#define IEEE80211_PPE_THRES_NSS_MASK				(7)
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU	\
+	(BIT(5) | BIT(6))
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK		0x78
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS		(3)
+#define IEEE80211_PPE_THRES_INFO_PPET_SIZE			(3)
+
+/*
+ * Calculate 802.11ax HE capabilities IE PPE field size
+ * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8*
+ */
+static inline u8
+ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
+{
+	u8 n;
+
+	if ((phy_cap_info[6] &
+	     IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
+		return 0;
+
+	n = hweight8(ppe_thres_hdr &
+		     IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
+	n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >>
+		   IEEE80211_PPE_THRES_NSS_POS));
+
+	/*
+	 * Each pair is 6 bits, and we need to add the 7 "header" bits to the
+	 * total size.
+	 */
+	n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
+	n = DIV_ROUND_UP(n, 8);
+
+	return n;
+}
+
+/* HE Operation defines */
+#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK			0x0000003f
+#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK		0x000001c0
+#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET		6
+#define IEEE80211_HE_OPERATION_TWT_REQUIRED			0x00000200
+#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK		0x000ffc00
+#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET		10
+#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x000100000
+#define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x000200000
+#define IEEE80211_HE_OPERATION_MULTI_BSSID_AP			0x10000000
+#define IEEE80211_HE_OPERATION_TX_BSSID_INDICATOR		0x20000000
+#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x40000000
+
+/*
+ * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size
+ * @he_oper_ie: byte data of the HE Operations IE, starting from the byte
+ *	after the ext ID byte. It is assumed that he_oper_ie has at least
+ *	sizeof(struct ieee80211_he_operation) bytes, checked already in
+ *	ieee802_11_parse_elems_crc()
+ * @return the actual size of the IE data (not including header), or 0 on error
+ */
+static inline u8
+ieee80211_he_oper_size(const u8 *he_oper_ie)
+{
+	struct ieee80211_he_operation *he_oper = (void *)he_oper_ie;
+	u8 oper_len = sizeof(struct ieee80211_he_operation);
+	u32 he_oper_params;
+
+	/* Make sure the input is not NULL */
+	if (!he_oper_ie)
+		return 0;
+
+	/* Calc required length */
+	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
+	if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
+		oper_len += 3;
+	if (he_oper_params & IEEE80211_HE_OPERATION_MULTI_BSSID_AP)
+		oper_len++;
+
+	/* Add the first byte (extension ID) to the total length */
+	oper_len++;
+
+	return oper_len;
+}
+
 /* Authentication algorithms */
 #define WLAN_AUTH_OPEN 0
 #define WLAN_AUTH_SHARED_KEY 1
@@ -1992,6 +2414,11 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_FILS_WRAPPED_DATA = 8,
 	WLAN_EID_EXT_FILS_PUBLIC_KEY = 12,
 	WLAN_EID_EXT_FILS_NONCE = 13,
+	WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14,
+	WLAN_EID_EXT_HE_CAPABILITY = 35,
+	WLAN_EID_EXT_HE_OPERATION = 36,
+	WLAN_EID_EXT_UORA = 37,
+	WLAN_EID_EXT_HE_MU_EDCA = 38,
 };
 
 /* Action category code */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5fbfe61f41c6..9ba1f289c439 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -285,6 +285,41 @@ struct ieee80211_sta_vht_cap {
 	struct ieee80211_vht_mcs_info vht_mcs;
 };
 
+#define IEEE80211_HE_PPE_THRES_MAX_LEN		25
+
+/**
+ * struct ieee80211_sta_he_cap - STA's HE capabilities
+ *
+ * This structure describes most essential parameters needed
+ * to describe 802.11ax HE capabilities for a STA.
+ *
+ * @has_he: true iff HE data is valid.
+ * @he_cap_elem: Fixed portion of the HE capabilities element.
+ * @he_mcs_nss_supp: The supported NSS/MCS combinations.
+ * @ppe_thres: Holds the PPE Thresholds data.
+ */
+struct ieee80211_sta_he_cap {
+	bool has_he;
+	struct ieee80211_he_cap_elem he_cap_elem;
+	struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp;
+	u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN];
+};
+
+/**
+ * struct ieee80211_sband_iftype_data
+ *
+ * This structure encapsulates sband data that is relevant for the
+ * interface types defined in @types_mask.  Each type in the
+ * @types_mask must be unique across all instances of iftype_data.
+ *
+ * @types_mask: interface types mask
+ * @he_cap: holds the HE capabilities
+ */
+struct ieee80211_sband_iftype_data {
+	u16 types_mask;
+	struct ieee80211_sta_he_cap he_cap;
+};
+
 /**
  * struct ieee80211_supported_band - frequency band definition
  *
@@ -301,6 +336,11 @@ struct ieee80211_sta_vht_cap {
  * @n_bitrates: Number of bitrates in @bitrates
  * @ht_cap: HT capabilities in this band
  * @vht_cap: VHT capabilities in this band
+ * @n_iftype_data: number of iftype data entries
+ * @iftype_data: interface type data entries.  Note that the bits in
+ *	@types_mask inside this structure cannot overlap (i.e. only
+ *	one occurrence of each type is allowed across all instances of
+ *	iftype_data).
  */
 struct ieee80211_supported_band {
 	struct ieee80211_channel *channels;
@@ -310,8 +350,55 @@ struct ieee80211_supported_band {
 	int n_bitrates;
 	struct ieee80211_sta_ht_cap ht_cap;
 	struct ieee80211_sta_vht_cap vht_cap;
+	u16 n_iftype_data;
+	const struct ieee80211_sband_iftype_data *iftype_data;
 };
 
+/**
+ * ieee80211_get_sband_iftype_data - return sband data for a given iftype
+ * @sband: the sband to search for the iftype on
+ * @iftype: enum nl80211_iftype
+ *
+ * Return: pointer to struct ieee80211_sband_iftype_data, or NULL if none found
+ */
+static inline const struct ieee80211_sband_iftype_data *
+ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
+				u8 iftype)
+{
+	int i;
+
+	if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
+		return NULL;
+
+	for (i = 0; i < sband->n_iftype_data; i++)  {
+		const struct ieee80211_sband_iftype_data *data =
+			&sband->iftype_data[i];
+
+		if (data->types_mask & BIT(iftype))
+			return data;
+	}
+
+	return NULL;
+}
+
+/**
+ * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA
+ * @sband: the sband to search for the STA on
+ *
+ * Return: pointer to the struct ieee80211_sta_he_cap, or NULL if none found
+ */
+static inline const struct ieee80211_sta_he_cap *
+ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband)
+{
+	const struct ieee80211_sband_iftype_data *data =
+		ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_STATION);
+
+	if (data && data->he_cap.has_he)
+		return &data->he_cap;
+
+	return NULL;
+}
+
 /**
  * wiphy_read_of_freq_limits - read frequency limits from device tree
  *
@@ -899,6 +986,8 @@ enum station_parameters_apply_mask {
  * @opmode_notif: operating mode field from Operating Mode Notification
  * @opmode_notif_used: information if operating mode field is used
  * @support_p2p_ps: information if station supports P2P PS mechanism
+ * @he_capa: HE capabilities of station
+ * @he_capa_len: the length of the HE capabilities
  */
 struct station_parameters {
 	const u8 *supported_rates;
@@ -926,6 +1015,8 @@ struct station_parameters {
 	u8 opmode_notif;
 	bool opmode_notif_used;
 	int support_p2p_ps;
+	const struct ieee80211_he_cap_elem *he_capa;
+	u8 he_capa_len;
 };
 
 /**
@@ -1000,12 +1091,14 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
  * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS
  * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval
  * @RATE_INFO_FLAGS_60G: 60GHz MCS
+ * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
  */
 enum rate_info_flags {
 	RATE_INFO_FLAGS_MCS			= BIT(0),
 	RATE_INFO_FLAGS_VHT_MCS			= BIT(1),
 	RATE_INFO_FLAGS_SHORT_GI		= BIT(2),
 	RATE_INFO_FLAGS_60G			= BIT(3),
+	RATE_INFO_FLAGS_HE_MCS			= BIT(4),
 };
 
 /**
@@ -1019,6 +1112,7 @@ enum rate_info_flags {
  * @RATE_INFO_BW_40: 40 MHz bandwidth
  * @RATE_INFO_BW_80: 80 MHz bandwidth
  * @RATE_INFO_BW_160: 160 MHz bandwidth
+ * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
  */
 enum rate_info_bw {
 	RATE_INFO_BW_20 = 0,
@@ -1027,6 +1121,7 @@ enum rate_info_bw {
 	RATE_INFO_BW_40,
 	RATE_INFO_BW_80,
 	RATE_INFO_BW_160,
+	RATE_INFO_BW_HE_RU,
 };
 
 /**
@@ -1035,10 +1130,14 @@ enum rate_info_bw {
  * Information about a receiving or transmitting bitrate
  *
  * @flags: bitflag of flags from &enum rate_info_flags
- * @mcs: mcs index if struct describes a 802.11n bitrate
+ * @mcs: mcs index if struct describes an HT/VHT/HE rate
  * @legacy: bitrate in 100kbit/s for 802.11abg
- * @nss: number of streams (VHT only)
+ * @nss: number of streams (VHT & HE only)
  * @bw: bandwidth (from &enum rate_info_bw)
+ * @he_gi: HE guard interval (from &enum nl80211_he_gi)
+ * @he_dcm: HE DCM value
+ * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
+ *	only valid if bw is %RATE_INFO_BW_HE_RU)
  */
 struct rate_info {
 	u8 flags;
@@ -1046,6 +1145,9 @@ struct rate_info {
 	u16 legacy;
 	u8 nss;
 	u8 bw;
+	u8 he_gi;
+	u8 he_dcm;
+	u8 he_ru_alloc;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 49f718e821a3..f82ce3c89ab7 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2237,6 +2237,9 @@ enum nl80211_commands {
  *      enforced.
  * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes
  *      a flow is assigned on each round of the DRR scheduler.
+ * @NL80211_ATTR_HE_CAPABILITY: HE Capability information element (from
+ *	association request when used with NL80211_CMD_NEW_STATION). Can be set
+ *	only if %NL80211_STA_FLAG_WME is set.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2677,6 +2680,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_TXQ_MEMORY_LIMIT,
 	NL80211_ATTR_TXQ_QUANTUM,
 
+	NL80211_ATTR_HE_CAPABILITY,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -2726,7 +2731,8 @@ enum nl80211_attrs {
 #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY	24
 #define NL80211_HT_CAPABILITY_LEN		26
 #define NL80211_VHT_CAPABILITY_LEN		12
-
+#define NL80211_HE_MIN_CAPABILITY_LEN           16
+#define NL80211_HE_MAX_CAPABILITY_LEN           51
 #define NL80211_MAX_NR_CIPHER_SUITES		5
 #define NL80211_MAX_NR_AKM_SUITES		2
 
@@ -2853,6 +2859,38 @@ struct nl80211_sta_flag_update {
 	__u32 set;
 } __attribute__((packed));
 
+/**
+ * enum nl80211_he_gi - HE guard interval
+ * @NL80211_RATE_INFO_HE_GI_0_8: 0.8 usec
+ * @NL80211_RATE_INFO_HE_GI_1_6: 1.6 usec
+ * @NL80211_RATE_INFO_HE_GI_3_2: 3.2 usec
+ */
+enum nl80211_he_gi {
+	NL80211_RATE_INFO_HE_GI_0_8,
+	NL80211_RATE_INFO_HE_GI_1_6,
+	NL80211_RATE_INFO_HE_GI_3_2,
+};
+
+/**
+ * enum nl80211_he_ru_alloc - HE RU allocation values
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_52: 52-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_106: 106-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_242: 242-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_484: 484-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_996: 996-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_2x996: 2x996-tone RU allocation
+ */
+enum nl80211_he_ru_alloc {
+	NL80211_RATE_INFO_HE_RU_ALLOC_26,
+	NL80211_RATE_INFO_HE_RU_ALLOC_52,
+	NL80211_RATE_INFO_HE_RU_ALLOC_106,
+	NL80211_RATE_INFO_HE_RU_ALLOC_242,
+	NL80211_RATE_INFO_HE_RU_ALLOC_484,
+	NL80211_RATE_INFO_HE_RU_ALLOC_996,
+	NL80211_RATE_INFO_HE_RU_ALLOC_2x996,
+};
+
 /**
  * enum nl80211_rate_info - bitrate information
  *
@@ -2885,6 +2923,13 @@ struct nl80211_sta_flag_update {
  * @NL80211_RATE_INFO_5_MHZ_WIDTH: 5 MHz width - note that this is
  *	a legacy rate and will be reported as the actual bitrate, i.e.
  *	a quarter of the base (20 MHz) rate
+ * @NL80211_RATE_INFO_HE_MCS: HE MCS index (u8, 0-11)
+ * @NL80211_RATE_INFO_HE_NSS: HE NSS value (u8, 1-8)
+ * @NL80211_RATE_INFO_HE_GI: HE guard interval identifier
+ *	(u8, see &enum nl80211_he_gi)
+ * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1)
+ * @NL80211_RATE_INFO_HE_RU_ALLOC: HE RU allocation, if not present then
+ *	non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc)
  * @__NL80211_RATE_INFO_AFTER_LAST: internal use
  */
 enum nl80211_rate_info {
@@ -2901,6 +2946,11 @@ enum nl80211_rate_info {
 	NL80211_RATE_INFO_160_MHZ_WIDTH,
 	NL80211_RATE_INFO_10_MHZ_WIDTH,
 	NL80211_RATE_INFO_5_MHZ_WIDTH,
+	NL80211_RATE_INFO_HE_MCS,
+	NL80211_RATE_INFO_HE_NSS,
+	NL80211_RATE_INFO_HE_GI,
+	NL80211_RATE_INFO_HE_DCM,
+	NL80211_RATE_INFO_HE_RU_ALLOC,
 
 	/* keep last */
 	__NL80211_RATE_INFO_AFTER_LAST,
@@ -3166,6 +3216,38 @@ enum nl80211_mpath_info {
 	NL80211_MPATH_INFO_MAX = __NL80211_MPATH_INFO_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_band_iftype_attr - Interface type data attributes
+ *
+ * @__NL80211_BAND_IFTYPE_ATTR_INVALID: attribute number 0 is reserved
+ * @NL80211_BAND_IFTYPE_ATTR_IFTYPES: nested attribute containing a flag attribute
+ *     for each interface type that supports the band data
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC: HE MAC capabilities as in HE
+ *     capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY: HE PHY capabilities as in HE
+ *     capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET: HE supported NSS/MCS as in HE
+ *     capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE: HE PPE thresholds information as
+ *     defined in HE capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently
+ *     defined
+ * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use
+ */
+enum nl80211_band_iftype_attr {
+	__NL80211_BAND_IFTYPE_ATTR_INVALID,
+
+	NL80211_BAND_IFTYPE_ATTR_IFTYPES,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
+
+	/* keep last */
+	__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST,
+	NL80211_BAND_IFTYPE_ATTR_MAX = __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST - 1
+};
+
 /**
  * enum nl80211_band_attr - band attributes
  * @__NL80211_BAND_ATTR_INVALID: attribute number 0 is reserved
@@ -3181,6 +3263,8 @@ enum nl80211_mpath_info {
  * @NL80211_BAND_ATTR_VHT_MCS_SET: 32-byte attribute containing the MCS set as
  *	defined in 802.11ac
  * @NL80211_BAND_ATTR_VHT_CAPA: VHT capabilities, as in the HT information IE
+ * @NL80211_BAND_ATTR_IFTYPE_DATA: nested array attribute, with each entry using
+ *	attributes from &enum nl80211_band_iftype_attr
  * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined
  * @__NL80211_BAND_ATTR_AFTER_LAST: internal use
  */
@@ -3196,6 +3280,7 @@ enum nl80211_band_attr {
 
 	NL80211_BAND_ATTR_VHT_MCS_SET,
 	NL80211_BAND_ATTR_VHT_CAPA,
+	NL80211_BAND_ATTR_IFTYPE_DATA,
 
 	/* keep last */
 	__NL80211_BAND_ATTR_AFTER_LAST,
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5fe35aafdd9c..d23abc619e77 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010		Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy)
 
 	/* sanity check supported bands/channels */
 	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		u16 types = 0;
+
 		sband = wiphy->bands[band];
 		if (!sband)
 			continue;
@@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy)
 			sband->channels[i].band = band;
 		}
 
+		for (i = 0; i < sband->n_iftype_data; i++) {
+			const struct ieee80211_sband_iftype_data *iftd;
+
+			iftd = &sband->iftype_data[i];
+
+			if (WARN_ON(!iftd->types_mask))
+				return -EINVAL;
+			if (WARN_ON(types & iftd->types_mask))
+				return -EINVAL;
+
+			/* at least one piece of information must be present */
+			if (WARN_ON(!iftd->he_cap.has_he))
+				return -EINVAL;
+
+			types |= iftd->types_mask;
+		}
+
 		have_band = true;
 	}
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7b21914ae18b..0ccce338a66e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
 	[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
 	[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
+	[NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
+					 .len = NL80211_HE_MAX_CAPABILITY_LEN },
 };
 
 /* policy for the key attributes */
@@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg,
 	return 0;
 }
 
+/* Put the iftypes mask and, if has_he is set, the four HE capability blobs */
+static int
+nl80211_send_iftype_data(struct sk_buff *msg,
+			 const struct ieee80211_sband_iftype_data *iftdata)
+{
+	const struct ieee80211_sta_he_cap *he = &iftdata->he_cap;
+
+	if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
+				iftdata->types_mask))
+		return -ENOBUFS;
+	if (!he->has_he)
+		return 0;
+
+	if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
+		    sizeof(he->he_cap_elem.mac_cap_info),
+		    he->he_cap_elem.mac_cap_info) ||
+	    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
+		    sizeof(he->he_cap_elem.phy_cap_info),
+		    he->he_cap_elem.phy_cap_info) ||
+	    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
+		    sizeof(he->he_mcs_nss_supp), &he->he_mcs_nss_supp) ||
+	    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
+		    sizeof(he->ppe_thres), he->ppe_thres))
+		return -ENOBUFS;
+
+	return 0;
+}
+
 static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 				      struct ieee80211_supported_band *sband)
 {
@@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 			 sband->vht_cap.cap)))
 		return -ENOBUFS;
 
+	if (sband->n_iftype_data) {
+		struct nlattr *nl_iftype_data =
+			nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA);
+		int err;
+
+		if (!nl_iftype_data)
+			return -ENOBUFS;
+
+		for (i = 0; i < sband->n_iftype_data; i++) {
+			struct nlattr *iftdata;
+
+			iftdata = nla_nest_start(msg, i + 1);
+			if (!iftdata)
+				return -ENOBUFS;
+
+			err = nl80211_send_iftype_data(msg,
+						       &sband->iftype_data[i]);
+			if (err)
+				return err;
+
+			nla_nest_end(msg, iftdata);
+		}
+
+		nla_nest_end(msg, nl_iftype_data);
+	}
+
 	/* add bitrates */
 	nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
 	if (!nl_rates)
@@ -4472,6 +4528,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
 	case RATE_INFO_BW_160:
 		rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
 		break;
+	case RATE_INFO_BW_HE_RU:
+		rate_flg = 0;
+		WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS));
 	}
 
 	if (rate_flg && nla_put_flag(msg, rate_flg))
@@ -4491,6 +4550,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
 		if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
 		    nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
 			return false;
+	} else if (info->flags & RATE_INFO_FLAGS_HE_MCS) {
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm))
+			return false;
+		if (info->bw == RATE_INFO_BW_HE_RU &&
+		    nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC,
+			       info->he_ru_alloc))
+			return false;
 	}
 
 	nla_nest_end(msg, rate);
@@ -4887,7 +4959,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 			return -EINVAL;
 		if (params->supported_rates)
 			return -EINVAL;
-		if (params->ext_capab || params->ht_capa || params->vht_capa)
+		if (params->ext_capab || params->ht_capa || params->vht_capa ||
+		    params->he_capa)
 			return -EINVAL;
 	}
 
@@ -5093,6 +5166,15 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
 		params->vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+		params->he_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+		params->he_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+		if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+			return -EINVAL;
+	}
 
 	err = nl80211_parse_sta_channel_info(info, params);
 	if (err)
@@ -5320,6 +5402,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
 
+	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+		params.he_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+		params.he_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+		/* max len is validated in nla policy */
+		if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+			return -EINVAL;
+	}
+
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
 		params.opmode_notif_used = true;
 		params.opmode_notif =
@@ -5352,6 +5445,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
 		params.ht_capa = NULL;
 		params.vht_capa = NULL;
+
+		/* HE requires WME */
+		if (params.he_capa_len)
+			return -EINVAL;
 	}
 
 	/* When you run into this, adjust the code below for the new flag */
diff --git a/net/wireless/util.c b/net/wireless/util.c
index b91597a8baa2..4ed06b271f32 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -4,6 +4,7 @@
  *
  * Copyright 2007-2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright 2017	Intel Deutschland GmbH
  */
 #include <linux/export.h>
 #include <linux/bitops.h>
@@ -1142,6 +1143,107 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
 	return 0;
 }
 
+/*
+ * Calculate the bitrate of an HE (802.11ax) rate.
+ *
+ * Returns the bitrate in units of 100 kbit/s (the unit of rate_info.legacy,
+ * which cfg80211_calculate_bitrate() returns for legacy rates), or 0 after
+ * a warning if the MCS, guard interval, RU allocation, NSS or bandwidth in
+ * @rate is out of range.
+ */
+static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
+{
+#define SCALE 2048
+	/* SCALE * (MCS 11 rate / MCS n rate), indexed by MCS */
+	static const u16 mcs_divisors[12] = {
+		34133, /* 16.666666... */
+		17067, /*  8.333333... */
+		11378, /*  5.555555... */
+		 8533, /*  4.166666... */
+		 5689, /*  2.777777... */
+		 4267, /*  2.083333... */
+		 3923, /*  1.851851... */
+		 3413, /*  1.666666... */
+		 2844, /*  1.388888... */
+		 2560, /*  1.250000... */
+		 2276, /*  1.111111... */
+		 2048, /*  1.000000... */
+	};
+	/*
+	 * Rates at MCS 11 with 8 spatial streams in units of 10 bit/s,
+	 * indexed by guard interval (&enum nl80211_he_gi).
+	 */
+	static const u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
+	static const u32 rates_996[3] =  { 480388888, 453700000, 408333333 };
+	static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
+	static const u32 rates_242[3] =  { 114711111, 108333333,  97500000 };
+	static const u32 rates_106[3] =  {  40000000,  37777777,  34000000 };
+	static const u32 rates_52[3]  =  {  18820000,  17777777,  16000000 };
+	static const u32 rates_26[3]  =  {   9411111,   8888888,   8000000 };
+	u64 tmp;
+	u32 result;
+
+	if (WARN_ON_ONCE(rate->mcs > 11))
+		return 0;
+
+	if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
+		return 0;
+	if (WARN_ON_ONCE(rate->he_ru_alloc >
+			 NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
+		return 0;
+	if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
+		return 0;
+
+	/* a full-channel RU uses that channel's rates: 2x996 tones = 160 MHz */
+	if (rate->bw == RATE_INFO_BW_160 ||
+	    (rate->bw == RATE_INFO_BW_HE_RU &&
+	     rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
+		result = rates_160M[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_80 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996))
+		result = rates_996[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_40 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484))
+		result = rates_484[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_20 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242))
+		result = rates_242[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106)
+		result = rates_106[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52)
+		result = rates_52[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
+		result = rates_26[rate->he_gi];
+	else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
+		      rate->bw, rate->he_ru_alloc))
+		return 0;
+
+	/* now scale to the appropriate MCS */
+	tmp = result;
+	tmp *= SCALE;
+	do_div(tmp, mcs_divisors[rate->mcs]);
+
+	/*
+	 * Take NSS and DCM into account. Stay in 64 bits here: the
+	 * 160 MHz rates multiplied by an NSS of 5 or more can exceed U32_MAX.
+	 */
+	tmp *= rate->nss;
+	do_div(tmp, 8);
+	if (rate->he_dcm)
+		do_div(tmp, 2);
+
+	/* the rate tables are in units of 10 bit/s, report 100 kbit/s */
+	do_div(tmp, 10000);
+
+	return tmp;
+}
+
 u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 {
 	if (rate->flags & RATE_INFO_FLAGS_MCS)
@@ -1150,6 +1252,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 		return cfg80211_calculate_bitrate_60g(rate);
 	if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
 		return cfg80211_calculate_bitrate_vht(rate);
+	if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
+		return cfg80211_calculate_bitrate_he(rate);
 
 	return rate->legacy;
 }
-- 
cgit v1.2.3


From 6b293258cded9c8ee44cce4081d9170d6d1b5f5d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 12 Jun 2018 15:19:11 +1000
Subject: fsi: scom: Major overhaul

This was too hard to split ... this adds a number of features
to the SCOM user interface:

 - Support for indirect SCOMs

 - read()/write() interface now handles errors and retries

 - New ioctl() "raw" interface for use by debuggers

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Reviewed-by: Eddie James <eajames@linux.vnet.ibm.com>
Reviewed-by: Alistair Popple <alistair@popple.id.au>
---
 drivers/fsi/fsi-scom.c   | 424 +++++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/fsi.h |  58 +++++++
 2 files changed, 452 insertions(+), 30 deletions(-)
 create mode 100644 include/uapi/linux/fsi.h

(limited to 'include/uapi/linux')

diff --git a/drivers/fsi/fsi-scom.c b/drivers/fsi/fsi-scom.c
index e98573ecdae1..39c74351f1bf 100644
--- a/drivers/fsi/fsi-scom.c
+++ b/drivers/fsi/fsi-scom.c
@@ -24,6 +24,8 @@
 #include <linux/list.h>
 #include <linux/idr.h>
 
+#include <uapi/linux/fsi.h>
+
 #define FSI_ENGID_SCOM		0x5
 
 /* SCOM engine register set */
@@ -41,14 +43,36 @@
 /* Status register bits */
 #define SCOM_STATUS_ERR_SUMMARY		0x80000000
 #define SCOM_STATUS_PROTECTION		0x01000000
+#define SCOM_STATUS_PARITY		0x04000000
 #define SCOM_STATUS_PIB_ABORT		0x00100000
 #define SCOM_STATUS_PIB_RESP_MASK	0x00007000
 #define SCOM_STATUS_PIB_RESP_SHIFT	12
 
 #define SCOM_STATUS_ANY_ERR		(SCOM_STATUS_ERR_SUMMARY | \
 					 SCOM_STATUS_PROTECTION | \
+					 SCOM_STATUS_PARITY |	  \
 					 SCOM_STATUS_PIB_ABORT | \
 					 SCOM_STATUS_PIB_RESP_MASK)
+/* SCOM address encodings */
+#define XSCOM_ADDR_IND_FLAG		BIT_ULL(63)
+#define XSCOM_ADDR_INF_FORM1		BIT_ULL(60)
+
+/* SCOM indirect stuff */
+#define XSCOM_ADDR_DIRECT_PART		0x7fffffffull
+#define XSCOM_ADDR_INDIRECT_PART	0x000fffff00000000ull
+#define XSCOM_DATA_IND_READ		BIT_ULL(63)
+#define XSCOM_DATA_IND_COMPLETE		BIT_ULL(31)
+#define XSCOM_DATA_IND_ERR_MASK		0x70000000ull
+#define XSCOM_DATA_IND_ERR_SHIFT	28
+#define XSCOM_DATA_IND_DATA		0x0000ffffull
+#define XSCOM_DATA_IND_FORM1_DATA	0x000fffffffffffffull
+#define XSCOM_ADDR_FORM1_LOW		0x000ffffffffull
+#define XSCOM_ADDR_FORM1_HI		0xfff00000000ull
+#define XSCOM_ADDR_FORM1_HI_SHIFT	20
+
+/* Retries */
+#define SCOM_MAX_RETRIES		100	/* Retries on busy */
+#define SCOM_MAX_IND_RETRIES		10	/* Retries indirect not ready */
 
 struct scom_device {
 	struct list_head link;
@@ -56,7 +80,7 @@ struct scom_device {
 	struct miscdevice mdev;
 	struct mutex lock;
 	char	name[32];
-	int idx;
+	int	idx;
 };
 
 #define to_scom_dev(x)		container_of((x), struct scom_device, mdev)
@@ -65,80 +89,304 @@ static struct list_head scom_devices;
 
 static DEFINE_IDA(scom_ida);
 
-static int put_scom(struct scom_device *scom_dev, uint64_t value,
-		    uint32_t addr)
+static int __put_scom(struct scom_device *scom_dev, uint64_t value,
+		      uint32_t addr, uint32_t *status)
 {
-	__be32 data;
+	__be32 data, raw_status;
 	int rc;
 
-	mutex_lock(&scom_dev->lock);
-
 	data = cpu_to_be32((value >> 32) & 0xffffffff);
 	rc = fsi_device_write(scom_dev->fsi_dev, SCOM_DATA0_REG, &data,
 				sizeof(uint32_t));
 	if (rc)
-		goto bail;
+		return rc;
 
 	data = cpu_to_be32(value & 0xffffffff);
 	rc = fsi_device_write(scom_dev->fsi_dev, SCOM_DATA1_REG, &data,
 				sizeof(uint32_t));
 	if (rc)
-		goto bail;
+		return rc;
 
 	data = cpu_to_be32(SCOM_WRITE_CMD | addr);
 	rc = fsi_device_write(scom_dev->fsi_dev, SCOM_CMD_REG, &data,
 				sizeof(uint32_t));
- bail:
-	mutex_unlock(&scom_dev->lock);
-	return rc;
+	if (rc)
+		return rc;
+	rc = fsi_device_read(scom_dev->fsi_dev, SCOM_STATUS_REG, &raw_status,
+			     sizeof(uint32_t));
+	if (rc)
+		return rc;
+	*status = be32_to_cpu(raw_status);
+
+	return 0;
 }
 
-static int get_scom(struct scom_device *scom_dev, uint64_t *value,
-		    uint32_t addr)
+static int __get_scom(struct scom_device *scom_dev, uint64_t *value,
+		      uint32_t addr, uint32_t *status)
 {
-	__be32 result, data;
+	__be32 data, raw_status;
 	int rc;
 
 
-	mutex_lock(&scom_dev->lock);
 	*value = 0ULL;
 	data = cpu_to_be32(SCOM_READ_CMD | addr);
 	rc = fsi_device_write(scom_dev->fsi_dev, SCOM_CMD_REG, &data,
 				sizeof(uint32_t));
 	if (rc)
-		goto bail;
+		return rc;
+	rc = fsi_device_read(scom_dev->fsi_dev, SCOM_STATUS_REG, &raw_status,
+			     sizeof(uint32_t));
+	if (rc)
+		return rc;
 
-	rc = fsi_device_read(scom_dev->fsi_dev, SCOM_DATA0_REG, &result,
+	/*
+	 * Read the data registers even on error, so we don't have
+	 * to interpret the status register here.
+	 */
+	rc = fsi_device_read(scom_dev->fsi_dev, SCOM_DATA0_REG, &data,
 				sizeof(uint32_t));
 	if (rc)
-		goto bail;
-
-	*value |= (uint64_t)be32_to_cpu(result) << 32;
-	rc = fsi_device_read(scom_dev->fsi_dev, SCOM_DATA1_REG, &result,
+		return rc;
+	*value |= (uint64_t)be32_to_cpu(data) << 32;
+	rc = fsi_device_read(scom_dev->fsi_dev, SCOM_DATA1_REG, &data,
 				sizeof(uint32_t));
 	if (rc)
-		goto bail;
+		return rc;
+	*value |= be32_to_cpu(data);
+	*status = be32_to_cpu(raw_status);
+
+	return rc;
+}
+
+static int put_indirect_scom_form0(struct scom_device *scom, uint64_t value,
+				   uint64_t addr, uint32_t *status)
+{
+	uint64_t ind_data, ind_addr;
+	int rc, retries, err = 0;
+
+	if (value & ~XSCOM_DATA_IND_DATA)
+		return -EINVAL;
+
+	ind_addr = addr & XSCOM_ADDR_DIRECT_PART;
+	ind_data = (addr & XSCOM_ADDR_INDIRECT_PART) | value;
+	rc = __put_scom(scom, ind_data, ind_addr, status);
+	if (rc || (*status & SCOM_STATUS_ANY_ERR))
+		return rc;
+
+	for (retries = 0; retries < SCOM_MAX_IND_RETRIES; retries++) {
+		rc = __get_scom(scom, &ind_data, addr, status);
+		if (rc || (*status & SCOM_STATUS_ANY_ERR))
+			return rc;
+
+		err = (ind_data & XSCOM_DATA_IND_ERR_MASK) >> XSCOM_DATA_IND_ERR_SHIFT;
+		*status = err << SCOM_STATUS_PIB_RESP_SHIFT;
+		if ((ind_data & XSCOM_DATA_IND_COMPLETE) || (err != SCOM_PIB_BLOCKED))
+			return 0;
+
+		msleep(1);
+	}
+	return rc;
+}
+
+static int put_indirect_scom_form1(struct scom_device *scom, uint64_t value,
+				   uint64_t addr, uint32_t *status)
+{
+	uint64_t ind_data, ind_addr;
+
+	if (value & ~XSCOM_DATA_IND_FORM1_DATA)
+		return -EINVAL;
+
+	ind_addr = addr & XSCOM_ADDR_FORM1_LOW;
+	ind_data = value | (addr & XSCOM_ADDR_FORM1_HI) << XSCOM_ADDR_FORM1_HI_SHIFT;
+	return __put_scom(scom, ind_data, ind_addr, status);
+}
+
+static int get_indirect_scom_form0(struct scom_device *scom, uint64_t *value,
+				   uint64_t addr, uint32_t *status)
+{
+	uint64_t ind_data, ind_addr;
+	int rc, retries, err = 0;
+
+	ind_addr = addr & XSCOM_ADDR_DIRECT_PART;
+	ind_data = (addr & XSCOM_ADDR_INDIRECT_PART) | XSCOM_DATA_IND_READ;
+	rc = __put_scom(scom, ind_data, ind_addr, status);
+	if (rc || (*status & SCOM_STATUS_ANY_ERR))
+		return rc;
+
+	for (retries = 0; retries < SCOM_MAX_IND_RETRIES; retries++) {
+		rc = __get_scom(scom, &ind_data, addr, status);
+		if (rc || (*status & SCOM_STATUS_ANY_ERR))
+			return rc;
+
+		err = (ind_data & XSCOM_DATA_IND_ERR_MASK) >> XSCOM_DATA_IND_ERR_SHIFT;
+		*status = err << SCOM_STATUS_PIB_RESP_SHIFT;
+		*value = ind_data & XSCOM_DATA_IND_DATA;
+
+		if ((ind_data & XSCOM_DATA_IND_COMPLETE) || (err != SCOM_PIB_BLOCKED))
+			return 0;
+
+		msleep(1);
+	}
+	return rc;
+}
+
+/* Write one SCOM: direct, or indirect form 0/1 as encoded in @addr */
+static int raw_put_scom(struct scom_device *scom, uint64_t value,
+			uint64_t addr, uint32_t *status)
+{
+	if (!(addr & XSCOM_ADDR_IND_FLAG))
+		return __put_scom(scom, value, addr, status);
+
+	if (addr & XSCOM_ADDR_INF_FORM1)
+		return put_indirect_scom_form1(scom, value, addr, status);
+	return put_indirect_scom_form0(scom, value, addr, status);
+}
+
+/* Read one SCOM; indirect form 1 addresses are rejected with -ENXIO */
+static int raw_get_scom(struct scom_device *scom, uint64_t *value,
+			uint64_t addr, uint32_t *status)
+{
+	if (!(addr & XSCOM_ADDR_IND_FLAG))
+		return __get_scom(scom, value, addr, status);
+	if (addr & XSCOM_ADDR_INF_FORM1)
+		return -ENXIO;
+	return get_indirect_scom_form0(scom, value, addr, status);
+}
+
+/* Map FSI2PIB bridge status bits to an errno; resets the bridge on -EIO */
+static int handle_fsi2pib_status(struct scom_device *scom, uint32_t status)
+{
+	uint32_t reset_val = -1;
+	int rc = 0;
+
+	if (status & SCOM_STATUS_PROTECTION)
+		rc = -EPERM;
+	else if (status & SCOM_STATUS_PARITY)
+		rc = -EIO;
+	else if (status & SCOM_STATUS_PIB_ABORT)
+		rc = -EBUSY;	/* -EBUSY on PIB abort forces a retry */
+	else if (status & SCOM_STATUS_ERR_SUMMARY)
+		rc = -EIO;
+
+	/* parity and error-summary failures (-EIO) also reset the bridge */
+	if (rc == -EIO)
+		fsi_device_write(scom->fsi_dev, SCOM_FSI2PIB_RESET_REG,
+				 &reset_val, sizeof(uint32_t));
+	return rc;
+}
+
+/* Map a PIB response code to an errno; hard errors also reset the bridge */
+static int handle_pib_status(struct scom_device *scom, uint8_t status)
+{
+	uint32_t reset_val = -1;
+	int rc = -EIO;
+
+	switch (status) {
+	case SCOM_PIB_SUCCESS:
+		return 0;
+	case SCOM_PIB_BLOCKED:
+		return -EBUSY;
+	case SCOM_PIB_OFFLINE:
+		rc = -ENODEV;
+		break;
+	case SCOM_PIB_BAD_ADDR:
+		rc = -ENXIO;
+		break;
+	case SCOM_PIB_TIMEOUT:
+		rc = -ETIMEDOUT;
+		break;
+	default:	/* partial good, clock error, parity error, other */
+		break;
+	}
+	fsi_device_write(scom->fsi_dev, SCOM_FSI2PIB_RESET_REG, &reset_val,
+			 sizeof(uint32_t));
+	return rc;
+}
 
-	*value |= be32_to_cpu(result);
- bail:
-	mutex_unlock(&scom_dev->lock);
+/*
+ * Write a SCOM, retrying for as long as the access reports -EBUSY.
+ * Returns 0 or a negative errno (-EBUSY once the retries are used up).
+ */
+static int put_scom(struct scom_device *scom, uint64_t value,
+		    uint64_t addr)
+{
+	uint32_t status, reset_val = -1;
+	int rc, retries;
+
+	for (retries = 0; retries < SCOM_MAX_RETRIES; retries++) {
+		rc = raw_put_scom(scom, value, addr, &status);
+		if (rc && (rc == -ENODEV || retries != 0))
+			return rc;
+		if (rc) {
+			/* first FSI failure: reset the bridge and retry */
+			fsi_device_write(scom->fsi_dev, SCOM_FSI2PIB_RESET_REG,
+					 &reset_val, sizeof(uint32_t));
+			rc = -EBUSY;
+		} else {
+			rc = handle_fsi2pib_status(scom, status);
+			if (rc == 0)
+				rc = handle_pib_status(scom,
+					(status & SCOM_STATUS_PIB_RESP_MASK)
+					>> SCOM_STATUS_PIB_RESP_SHIFT);
+		}
+		/* anything but -EBUSY (success included) ends the loop */
+		if (rc != -EBUSY)
+			break;
+		msleep(1);
+	}
+	return rc;
+}
+
+/*
+ * Read a SCOM, retrying for as long as the access reports -EBUSY.
+ * Returns 0 or a negative errno (-EBUSY once the retries are used up).
+ */
+static int get_scom(struct scom_device *scom, uint64_t *value,
+		    uint64_t addr)
+{
+	uint32_t status, reset_val = -1;
+	int rc, retries;
+
+	for (retries = 0; retries < SCOM_MAX_RETRIES; retries++) {
+		rc = raw_get_scom(scom, value, addr, &status);
+		if (rc && (rc == -ENODEV || retries != 0))
+			return rc;
+		if (rc) {
+			/* first FSI failure: reset the bridge and retry */
+			fsi_device_write(scom->fsi_dev, SCOM_FSI2PIB_RESET_REG,
+					 &reset_val, sizeof(uint32_t));
+			rc = -EBUSY;
+		} else {
+			rc = handle_fsi2pib_status(scom, status);
+			if (rc == 0)
+				rc = handle_pib_status(scom,
+					(status & SCOM_STATUS_PIB_RESP_MASK)
+					>> SCOM_STATUS_PIB_RESP_SHIFT);
+		}
+		/* anything but -EBUSY (success included) ends the loop */
+		if (rc != -EBUSY)
+			break;
+		msleep(1);
+	}
 	return rc;
 }
 
 static ssize_t scom_read(struct file *filep, char __user *buf, size_t len,
 			 loff_t *offset)
 {
-	int rc;
 	struct miscdevice *mdev =
 				(struct miscdevice *)filep->private_data;
 	struct scom_device *scom = to_scom_dev(mdev);
 	struct device *dev = &scom->fsi_dev->dev;
 	uint64_t val;
+	int rc;
 
 	if (len != sizeof(uint64_t))
 		return -EINVAL;
 
+	mutex_lock(&scom->lock);
 	rc = get_scom(scom, &val, *offset);
+	mutex_unlock(&scom->lock);
 	if (rc) {
 		dev_dbg(dev, "get_scom fail:%d\n", rc);
 		return rc;
@@ -169,7 +417,9 @@ static ssize_t scom_write(struct file *filep, const char __user *buf,
 		return -EINVAL;
 	}
 
+	mutex_lock(&scom->lock);
 	rc = put_scom(scom, val, *offset);
+	mutex_unlock(&scom->lock);
 	if (rc) {
 		dev_dbg(dev, "put_scom failed with:%d\n", rc);
 		return rc;
@@ -193,11 +443,125 @@ static loff_t scom_llseek(struct file *file, loff_t offset, int whence)
 	return offset;
 }
 
+static void raw_convert_status(struct scom_access *acc, uint32_t status)
+{
+	acc->pib_status = (status & SCOM_STATUS_PIB_RESP_MASK) >>
+		SCOM_STATUS_PIB_RESP_SHIFT;
+	acc->intf_errors = 0;
+
+	if (status & SCOM_STATUS_PROTECTION)
+		acc->intf_errors |= SCOM_INTF_ERR_PROTECTION;
+	else if (status & SCOM_STATUS_PARITY)
+		acc->intf_errors |= SCOM_INTF_ERR_PARITY;
+	else if (status & SCOM_STATUS_PIB_ABORT)
+		acc->intf_errors |= SCOM_INTF_ERR_ABORT;
+	else if (status & SCOM_STATUS_ERR_SUMMARY)
+		acc->intf_errors |= SCOM_INTF_ERR_UNKNOWN;
+}
+
+static int scom_raw_read(struct scom_device *scom, void __user *argp)
+{
+	struct scom_access acc;
+	uint32_t status;
+	int rc;
+
+	if (copy_from_user(&acc, argp, sizeof(struct scom_access)))
+		return -EFAULT;
+
+	rc = raw_get_scom(scom, &acc.data, acc.addr, &status);
+	if (rc)
+		return rc;
+	raw_convert_status(&acc, status);
+	if (copy_to_user(argp, &acc, sizeof(struct scom_access)))
+		return -EFAULT;
+	return 0;
+}
+
+static int scom_raw_write(struct scom_device *scom, void __user *argp)
+{
+	u64 prev_data, mask, data;
+	struct scom_access acc;
+	uint32_t status;
+	int rc;
+
+	if (copy_from_user(&acc, argp, sizeof(struct scom_access)))
+		return -EFAULT;
+
+	if (acc.mask) {
+		rc = raw_get_scom(scom, &prev_data, acc.addr, &status);
+		if (rc)
+			return rc;
+		if (status & SCOM_STATUS_ANY_ERR)
+			goto fail;
+		mask = acc.mask;
+	} else {
+		prev_data = mask = -1ull;
+	}
+	data = (prev_data & ~mask) | (acc.data & mask);
+	rc = raw_put_scom(scom, data, acc.addr, &status);
+	if (rc)
+		return rc;
+ fail:
+	raw_convert_status(&acc, status);
+	if (copy_to_user(argp, &acc, sizeof(struct scom_access)))
+		return -EFAULT;
+	return 0;
+}
+
+static int scom_reset(struct scom_device *scom, void __user *argp)
+{
+	uint32_t flags, dummy = -1;
+	int rc = 0;
+
+	if (get_user(flags, (__u32 __user *)argp))
+		return -EFAULT;
+	if (flags & SCOM_RESET_PIB)
+		rc = fsi_device_write(scom->fsi_dev, SCOM_PIB_RESET_REG, &dummy,
+				      sizeof(uint32_t));
+	if (!rc && (flags & (SCOM_RESET_PIB | SCOM_RESET_INTF)))
+		rc = fsi_device_write(scom->fsi_dev, SCOM_FSI2PIB_RESET_REG, &dummy,
+				      sizeof(uint32_t));
+	return rc;
+}
+
+static int scom_check(struct scom_device *scom, void __user *argp)
+{
+	/* Still need to find out how to get "protected" */
+	return put_user(SCOM_CHECK_SUPPORTED, (__u32 __user *)argp);
+}
+
+static long scom_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct miscdevice *mdev = file->private_data;
+	struct scom_device *scom = to_scom_dev(mdev);
+	void __user *argp = (void __user *)arg;
+	int rc = -ENOTTY;
+
+	mutex_lock(&scom->lock);
+	switch(cmd) {
+	case FSI_SCOM_CHECK:
+		rc = scom_check(scom, argp);
+		break;
+	case FSI_SCOM_READ:
+		rc = scom_raw_read(scom, argp);
+		break;
+	case FSI_SCOM_WRITE:
+		rc = scom_raw_write(scom, argp);
+		break;
+	case FSI_SCOM_RESET:
+		rc = scom_reset(scom, argp);
+		break;
+	}
+	mutex_unlock(&scom->lock);
+	return rc;
+}
+
 static const struct file_operations scom_fops = {
-	.owner	= THIS_MODULE,
-	.llseek	= scom_llseek,
-	.read	= scom_read,
-	.write	= scom_write,
+	.owner		= THIS_MODULE,
+	.llseek		= scom_llseek,
+	.read		= scom_read,
+	.write		= scom_write,
+	.unlocked_ioctl	= scom_ioctl,
 };
 
 static int scom_probe(struct device *dev)
diff --git a/include/uapi/linux/fsi.h b/include/uapi/linux/fsi.h
new file mode 100644
index 000000000000..da577ecd90e7
--- /dev/null
+++ b/include/uapi/linux/fsi.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_FSI_H
+#define _UAPI_LINUX_FSI_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * /dev/scom "raw" ioctl interface
+ *
+ * The driver supports a high level "read/write" interface which
+ * handles retries and converts the status to Linux error codes,
+ * however low level tools and debuggers need to access the "raw"
+ * HW status information and interpret it themselves, so this
+ * ioctl interface is also provided for their use case.
+ */
+
+/* Structure for SCOM read/write */
+struct scom_access {
+	__u64	addr;		/* SCOM address, supports indirect */
+	__u64	data;		/* SCOM data (in for write, out for read) */
+	__u64	mask;		/* Data mask for writes */
+	__u32	intf_errors;	/* Interface error flags */
+#define SCOM_INTF_ERR_PARITY		0x00000001 /* Parity error */
+#define SCOM_INTF_ERR_PROTECTION	0x00000002 /* Blocked by secure boot */
+#define SCOM_INTF_ERR_ABORT		0x00000004 /* PIB reset during access */
+#define SCOM_INTF_ERR_UNKNOWN		0x80000000 /* Unknown error */
+	/*
+	 * Note: Any other bit set in intf_errors needs to be considered as an
+	 * error. Future implementations may define new error conditions. The
+	 * pib_status below is only valid if intf_errors is 0.
+	 */
+	__u8	pib_status;	/* 3-bit PIB status */
+#define SCOM_PIB_SUCCESS	0	/* Access successful */
+#define SCOM_PIB_BLOCKED	1	/* PIB blocked, pls retry */
+#define SCOM_PIB_OFFLINE	2	/* Chiplet offline */
+#define SCOM_PIB_PARTIAL	3	/* Partial good */
+#define SCOM_PIB_BAD_ADDR	4	/* Invalid address */
+#define SCOM_PIB_CLK_ERR	5	/* Clock error */
+#define SCOM_PIB_PARITY_ERR	6	/* Parity error on the PIB bus */
+#define SCOM_PIB_TIMEOUT	7	/* Bus timeout */
+	__u8	pad;
+};
+
+/* Flags for SCOM check */
+#define SCOM_CHECK_SUPPORTED	0x00000001	/* Interface supported */
+#define SCOM_CHECK_PROTECTED	0x00000002	/* Interface blocked by secure boot */
+
+/* Flags for SCOM reset */
+#define SCOM_RESET_INTF		0x00000001	/* Reset interface */
+#define SCOM_RESET_PIB		0x00000002	/* Reset PIB */
+
+#define FSI_SCOM_CHECK	_IOR('s', 0x00, __u32)
+#define FSI_SCOM_READ	_IOWR('s', 0x01, struct scom_access)
+#define FSI_SCOM_WRITE	_IOWR('s', 0x02, struct scom_access)
+#define FSI_SCOM_RESET	_IOW('s', 0x03, __u32)
+
+#endif /* _UAPI_LINUX_FSI_H */
-- 
cgit v1.2.3


From d904ac0320d3c4ff4e9d80e4294ca5dde803696f Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 5 Jun 2018 11:45:07 -0400
Subject: audit: rename FILTER_TYPE to FILTER_EXCLUDE

The AUDIT_FILTER_TYPE name is vague and misleading due to not describing
where or when the filter is applied and obsolete due to its available
filter fields having been expanded.

Userspace has already renamed it from AUDIT_FILTER_TYPE to
AUDIT_FILTER_EXCLUDE without checking if it already exists.  The
userspace maintainer assures that as long as it is set to the same value
it will not be a problem since the userspace code does not treat
compiler warnings as errors.  If this policy changes, a check for whether
it is already defined can be added at the same time.

See: https://github.com/linux-audit/audit-kernel/issues/89

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/uapi/linux/audit.h |  3 ++-
 kernel/audit.c             |  2 +-
 kernel/auditfilter.c       | 10 +++++-----
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index c35aee9ad4a6..4e3eaba84175 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -157,7 +157,8 @@
 #define AUDIT_FILTER_ENTRY	0x02	/* Apply rule at syscall entry */
 #define AUDIT_FILTER_WATCH	0x03	/* Apply rule to file system watches */
 #define AUDIT_FILTER_EXIT	0x04	/* Apply rule at syscall exit */
-#define AUDIT_FILTER_TYPE	0x05	/* Apply rule at audit_log_start */
+#define AUDIT_FILTER_EXCLUDE	0x05	/* Apply rule before record creation */
+#define AUDIT_FILTER_TYPE	AUDIT_FILTER_EXCLUDE /* obsolete misleading naming */
 #define AUDIT_FILTER_FS		0x06	/* Apply rule at __audit_inode_child */
 
 #define AUDIT_NR_FILTERS	7
diff --git a/kernel/audit.c b/kernel/audit.c
index e7478cb58079..5c0a1d7b0c7b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1754,7 +1754,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	if (audit_initialized != AUDIT_INITIALIZED)
 		return NULL;
 
-	if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
+	if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
 		return NULL;
 
 	/* NOTE: don't ever fail/sleep on these two conditions:
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6db9847ca031..bf309f2592c4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -264,7 +264,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
 	case AUDIT_FILTER_TASK:
 #endif
 	case AUDIT_FILTER_USER:
-	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_EXCLUDE:
 	case AUDIT_FILTER_FS:
 		;
 	}
@@ -337,7 +337,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 {
 	switch(f->type) {
 	case AUDIT_MSGTYPE:
-		if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+		if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
 		    entry->rule.listnr != AUDIT_FILTER_USER)
 			return -EINVAL;
 		break;
@@ -929,7 +929,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
 	/* If any of these, don't count towards total */
 	switch(entry->rule.listnr) {
 	case AUDIT_FILTER_USER:
-	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_EXCLUDE:
 	case AUDIT_FILTER_FS:
 		dont_count = 1;
 	}
@@ -1011,7 +1011,7 @@ int audit_del_rule(struct audit_entry *entry)
 	/* If any of these, don't count towards total */
 	switch(entry->rule.listnr) {
 	case AUDIT_FILTER_USER:
-	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_EXCLUDE:
 	case AUDIT_FILTER_FS:
 		dont_count = 1;
 	}
@@ -1372,7 +1372,7 @@ int audit_filter(int msgtype, unsigned int listtype)
 				break;
 		}
 		if (result > 0) {
-			if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE)
+			if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE)
 				ret = 0;
 			break;
 		}
-- 
cgit v1.2.3


From 9b42c1f179a614e11893ae4619f0304a38f481ae Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 12 Jun 2018 12:44:26 +0200
Subject: xfrm: Extend the output_mark to support input direction and masking.

We already support setting an output mark at the xfrm_state,
unfortunately this does not support the input direction and
masking the marks that will be applied to the skb. This change
adds support applying a masked value in both directions.

The existing XFRMA_OUTPUT_MARK number is reused for this purpose
and as it is now bi-directional, it is renamed to XFRMA_SET_MARK.

An additional XFRMA_SET_MARK_MASK attribute is added for setting the
mask. If the mask attribute is not provided, it is set to 0xffffffff,
keeping the XFRMA_OUTPUT_MARK existing 'full mask' semantics.

Co-developed-by: Tobias Brunner <tobias@strongswan.org>
Co-developed-by: Eyal Birger <eyal.birger@gmail.com>
Co-developed-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Tobias Brunner <tobias@strongswan.org>
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
---
 include/net/xfrm.h        |  9 ++++++++-
 include/uapi/linux/xfrm.h |  4 +++-
 net/xfrm/xfrm_device.c    |  3 ++-
 net/xfrm/xfrm_input.c     |  2 ++
 net/xfrm/xfrm_output.c    |  3 +--
 net/xfrm/xfrm_policy.c    |  5 +++--
 net/xfrm/xfrm_user.c      | 48 +++++++++++++++++++++++++++++++++++++----------
 7 files changed, 57 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 557122846e0e..3dc83ba26f62 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -166,7 +166,7 @@ struct xfrm_state {
 		int		header_len;
 		int		trailer_len;
 		u32		extra_flags;
-		u32		output_mark;
+		struct xfrm_mark	smark;
 	} props;
 
 	struct xfrm_lifetime_cfg lft;
@@ -2012,6 +2012,13 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
 	return ret;
 }
 
+static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
+{
+	struct xfrm_mark *m = &x->props.smark;
+
+	return (m->v & m->m) | (mark & ~m->m);
+}
+
 static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
 				    unsigned int family)
 {
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index e3af2859188b..5a6ed7ce5a29 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -305,9 +305,11 @@ enum xfrm_attr_type_t {
 	XFRMA_ADDRESS_FILTER,	/* struct xfrm_address_filter */
 	XFRMA_PAD,
 	XFRMA_OFFLOAD_DEV,	/* struct xfrm_state_offload */
-	XFRMA_OUTPUT_MARK,	/* __u32 */
+	XFRMA_SET_MARK,		/* __u32 */
+	XFRMA_SET_MARK_MASK,	/* __u32 */
 	__XFRMA_MAX
 
+#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK	/* Compatibility */
 #define XFRMA_MAX (__XFRMA_MAX - 1)
 };
 
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e15a6e..16c1230d20fa 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		}
 
 		dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
-					x->props.family, x->props.output_mark);
+					x->props.family,
+					xfrm_smark_get(0, x));
 		if (IS_ERR(dst))
 			return 0;
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 352abca2605f..074810436242 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -339,6 +339,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 			goto drop;
 		}
 
+		skb->mark = xfrm_smark_get(skb->mark, x);
+
 		skb->sp->xvec[skb->sp->len++] = x;
 
 lock:
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 89b178a78dc7..45ba07ab3e4f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 			goto error_nolock;
 		}
 
-		if (x->props.output_mark)
-			skb->mark = x->props.output_mark;
+		skb->mark = xfrm_smark_get(skb->mark, x);
 
 		err = x->outer_mode->output(x, skb);
 		if (err) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5f48251c1319..7637637717ec 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1607,10 +1607,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst_copy_metrics(dst1, dst);
 
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+			__u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+
 			family = xfrm[i]->props.family;
 			dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
-					      &saddr, &daddr, family,
-					      xfrm[i]->props.output_mark);
+					      &saddr, &daddr, family, mark);
 			err = PTR_ERR(dst);
 			if (IS_ERR(dst))
 				goto put_states;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 080035f056d9..9602cc9e05ab 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -527,6 +527,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
 		x->replay_maxdiff = nla_get_u32(rt);
 }
 
+static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
+{
+	if (attrs[XFRMA_SET_MARK]) {
+		m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
+		if (attrs[XFRMA_SET_MARK_MASK])
+			m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]);
+		else
+			m->m = 0xffffffff;
+	} else {
+		m->v = m->m = 0;
+	}
+}
+
 static struct xfrm_state *xfrm_state_construct(struct net *net,
 					       struct xfrm_usersa_info *p,
 					       struct nlattr **attrs,
@@ -579,8 +592,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 
 	xfrm_mark_get(attrs, &x->mark);
 
-	if (attrs[XFRMA_OUTPUT_MARK])
-		x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
+	xfrm_smark_init(attrs, &x->props.smark);
 
 	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
 	if (err)
@@ -824,6 +836,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
 	return 0;
 }
 
+static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
+{
+	int ret = 0;
+
+	if (m->v | m->m) {
+		ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
+		if (!ret)
+			ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
+	}
+	return ret;
+}
+
 /* Don't change this without updating xfrm_sa_len! */
 static int copy_to_user_state_extra(struct xfrm_state *x,
 				    struct xfrm_usersa_info *p,
@@ -887,6 +911,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 	ret = xfrm_mark_put(skb, &x->mark);
 	if (ret)
 		goto out;
+
+	ret = xfrm_smark_put(skb, &x->props.smark);
+	if (ret)
+		goto out;
+
 	if (x->replay_esn)
 		ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
 			      xfrm_replay_state_esn_len(x->replay_esn),
@@ -900,11 +929,7 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 		ret = copy_user_offload(&x->xso, skb);
 	if (ret)
 		goto out;
-	if (x->props.output_mark) {
-		ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
-		if (ret)
-			goto out;
-	}
+
 	if (x->security)
 		ret = copy_sec_ctx(x->security, skb);
 out:
@@ -2493,7 +2518,8 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_PROTO]		= { .type = NLA_U8 },
 	[XFRMA_ADDRESS_FILTER]	= { .len = sizeof(struct xfrm_address_filter) },
 	[XFRMA_OFFLOAD_DEV]	= { .len = sizeof(struct xfrm_user_offload) },
-	[XFRMA_OUTPUT_MARK]	= { .type = NLA_U32 },
+	[XFRMA_SET_MARK]	= { .type = NLA_U32 },
+	[XFRMA_SET_MARK_MASK]	= { .type = NLA_U32 },
 };
 
 static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2719,8 +2745,10 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(x->props.extra_flags));
 	if (x->xso.dev)
 		 l += nla_total_size(sizeof(x->xso));
-	if (x->props.output_mark)
-		l += nla_total_size(sizeof(x->props.output_mark));
+	if (x->props.smark.v | x->props.smark.m) {
+		l += nla_total_size(sizeof(x->props.smark.v));
+		l += nla_total_size(sizeof(x->props.smark.m));
+	}
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
 	l += nla_total_size_64bit(sizeof(u64));
-- 
cgit v1.2.3


From 7e6526404adedf079279aa7aa11722deaca8fe2e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 12 Jun 2018 14:07:07 +0200
Subject: xfrm: Add a new lookup key to match xfrm interfaces.

This patch adds the xfrm interface id as a lookup key
for xfrm states and policies. With this we can assign
states and policies to virtual xfrm interfaces.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Acked-by: Benedict Wong <benedictwong@google.com>
Tested-by: Benedict Wong <benedictwong@google.com>
Tested-by: Antony Antony <antony@phenome.org>
Reviewed-by: Eyal Birger <eyal.birger@gmail.com>
---
 include/net/xfrm.h        | 21 +++++++++++++-----
 include/uapi/linux/xfrm.h |  1 +
 net/core/pktgen.c         |  2 +-
 net/key/af_key.c          |  6 +++---
 net/xfrm/xfrm_policy.c    | 18 +++++++++++-----
 net/xfrm/xfrm_state.c     | 19 ++++++++++++-----
 net/xfrm/xfrm_user.c      | 54 +++++++++++++++++++++++++++++++++++++++++------
 7 files changed, 96 insertions(+), 25 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 3dc83ba26f62..e8bada4d2a45 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -147,6 +147,7 @@ struct xfrm_state {
 	struct xfrm_id		id;
 	struct xfrm_selector	sel;
 	struct xfrm_mark	mark;
+	u32			if_id;
 	u32			tfcpad;
 
 	u32			genid;
@@ -574,6 +575,7 @@ struct xfrm_policy {
 	atomic_t		genid;
 	u32			priority;
 	u32			index;
+	u32			if_id;
 	struct xfrm_mark	mark;
 	struct xfrm_selector	selector;
 	struct xfrm_lifetime_cfg lft;
@@ -1533,7 +1535,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr,
 				   struct xfrm_tmpl *tmpl,
 				   struct xfrm_policy *pol, int *err,
 				   unsigned short family);
-struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark,
+struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
 				       xfrm_address_t *daddr,
 				       xfrm_address_t *saddr,
 				       unsigned short family,
@@ -1690,20 +1692,20 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
 		     void *);
 void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 					  u8 type, int dir,
 					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete,
 					  int *err);
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir,
-				     u32 id, int delete, int *err);
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, u8,
+				     int dir, u32 id, int delete, int *err);
 int xfrm_policy_flush(struct net *net, u8 type, bool task_valid);
 void xfrm_policy_hash_rebuild(struct net *net);
 u32 xfrm_get_acqseq(void);
 int verify_spi_info(u8 proto, u32 min, u32 max);
 int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
 struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
-				 u8 mode, u32 reqid, u8 proto,
+				 u8 mode, u32 reqid, u32 if_id, u8 proto,
 				 const xfrm_address_t *daddr,
 				 const xfrm_address_t *saddr, int create,
 				 unsigned short family);
@@ -2019,6 +2021,15 @@ static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
 	return (m->v & m->m) | (mark & ~m->m);
 }
 
+static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id)
+{
+	int ret = 0;
+
+	if (if_id)
+		ret = nla_put_u32(skb, XFRMA_IF_ID, if_id);
+	return ret;
+}
+
 static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
 				    unsigned int family)
 {
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 5a6ed7ce5a29..5f3b9fec7b5f 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -307,6 +307,7 @@ enum xfrm_attr_type_t {
 	XFRMA_OFFLOAD_DEV,	/* struct xfrm_state_offload */
 	XFRMA_SET_MARK,		/* __u32 */
 	XFRMA_SET_MARK_MASK,	/* __u32 */
+	XFRMA_IF_ID,		/* __u32 */
 	__XFRMA_MAX
 
 #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK	/* Compatibility */
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 49368e21d228..6d37dbf0aa64 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2255,7 +2255,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
 			x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
 		} else {
 			/* slow path: we dont already have xfrm_state */
-			x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+			x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
 						(xfrm_address_t *)&pkt_dev->cur_daddr,
 						(xfrm_address_t *)&pkt_dev->cur_saddr,
 						AF_INET,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 8bdc1cbe490a..398ebcd614a0 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
 	}
 
 	if (!x)
-		x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family);
+		x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
 
 	if (x == NULL)
 		return -ENOENT;
@@ -2414,7 +2414,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
 			return err;
 	}
 
-	xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+	xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
 				   pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
 				   1, &err);
 	security_xfrm_policy_free(pol_ctx);
@@ -2663,7 +2663,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
 		return -EINVAL;
 
 	delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
-	xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+	xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
 			      dir, pol->sadb_x_policy_id, delete, &err);
 	if (xp == NULL)
 		return -ENOENT;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7637637717ec..fc0c69312b2c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -747,6 +747,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	newpos = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (pol->type == policy->type &&
+		    pol->if_id == policy->if_id &&
 		    !selector_cmp(&pol->selector, &policy->selector) &&
 		    xfrm_policy_mark_match(policy, pol) &&
 		    xfrm_sec_ctx_match(pol->security, policy->security) &&
@@ -798,8 +799,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
 
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
-					  int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
+					  u8 type, int dir,
+					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete,
 					  int *err)
 {
@@ -812,6 +814,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (pol->type == type &&
+		    pol->if_id == if_id &&
 		    (mark & pol->mark.m) == pol->mark.v &&
 		    !selector_cmp(sel, &pol->selector) &&
 		    xfrm_sec_ctx_match(ctx, pol->security)) {
@@ -837,8 +840,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
-				     int dir, u32 id, int delete, int *err)
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
+				     u8 type, int dir, u32 id, int delete,
+				     int *err)
 {
 	struct xfrm_policy *pol, *ret;
 	struct hlist_head *chain;
@@ -853,6 +857,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, byidx) {
 		if (pol->type == type && pol->index == id &&
+		    pol->if_id == if_id &&
 		    (mark & pol->mark.m) == pol->mark.v) {
 			xfrm_pol_hold(pol);
 			if (delete) {
@@ -1063,6 +1068,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 	bool match;
 
 	if (pol->family != family ||
+	    pol->if_id != fl->flowi_xfrm.if_id ||
 	    (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
 	    pol->type != type)
 		return ret;
@@ -1177,7 +1183,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 
 		match = xfrm_selector_match(&pol->selector, fl, family);
 		if (match) {
-			if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
+			if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
+			    pol->if_id != fl->flowi_xfrm.if_id) {
 				pol = NULL;
 				goto out;
 			}
@@ -1305,6 +1312,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 		newp->lft = old->lft;
 		newp->curlft = old->curlft;
 		newp->mark = old->mark;
+		newp->if_id = old->if_id;
 		newp->action = old->action;
 		newp->flags = old->flags;
 		newp->xfrm_nr = old->xfrm_nr;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8308281f3253..3803b6813fc5 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -941,6 +941,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 	int error = 0;
 	struct xfrm_state *best = NULL;
 	u32 mark = pol->mark.v & pol->mark.m;
+	u32 if_id = fl->flowi_xfrm.if_id;
 	unsigned short encap_family = tmpl->encap_family;
 	unsigned int sequence;
 	struct km_event c;
@@ -955,6 +956,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->if_id == if_id &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
@@ -971,6 +973,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->if_id == if_id &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
@@ -1010,6 +1013,7 @@ found:
 		 * to current session. */
 		xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
 		memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+		x->if_id = if_id;
 
 		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
 		if (error) {
@@ -1067,7 +1071,7 @@ out:
 }
 
 struct xfrm_state *
-xfrm_stateonly_find(struct net *net, u32 mark,
+xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
 		    xfrm_address_t *daddr, xfrm_address_t *saddr,
 		    unsigned short family, u8 mode, u8 proto, u32 reqid)
 {
@@ -1080,6 +1084,7 @@ xfrm_stateonly_find(struct net *net, u32 mark,
 		if (x->props.family == family &&
 		    x->props.reqid == reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->if_id == if_id &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_state_addr_check(x, daddr, saddr, family) &&
 		    mode == x->props.mode &&
@@ -1160,11 +1165,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
 	struct xfrm_state *x;
 	unsigned int h;
 	u32 mark = xnew->mark.v & xnew->mark.m;
+	u32 if_id = xnew->if_id;
 
 	h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
 	hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
 		if (x->props.family	== family &&
 		    x->props.reqid	== reqid &&
+		    x->if_id		== if_id &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
 		    xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
@@ -1187,7 +1194,7 @@ EXPORT_SYMBOL(xfrm_state_insert);
 static struct xfrm_state *__find_acq_core(struct net *net,
 					  const struct xfrm_mark *m,
 					  unsigned short family, u8 mode,
-					  u32 reqid, u8 proto,
+					  u32 reqid, u32 if_id, u8 proto,
 					  const xfrm_address_t *daddr,
 					  const xfrm_address_t *saddr,
 					  int create)
@@ -1242,6 +1249,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 		x->props.family = family;
 		x->props.mode = mode;
 		x->props.reqid = reqid;
+		x->if_id = if_id;
 		x->mark.v = m->v;
 		x->mark.m = m->m;
 		x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
@@ -1296,7 +1304,7 @@ int xfrm_state_add(struct xfrm_state *x)
 
 	if (use_spi && !x1)
 		x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
-				     x->props.reqid, x->id.proto,
+				     x->props.reqid, x->if_id, x->id.proto,
 				     &x->id.daddr, &x->props.saddr, 0);
 
 	__xfrm_state_bump_genids(x);
@@ -1395,6 +1403,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
 	x->props.flags = orig->props.flags;
 	x->props.extra_flags = orig->props.extra_flags;
 
+	x->if_id = orig->if_id;
 	x->tfcpad = orig->tfcpad;
 	x->replay_maxdiff = orig->replay_maxdiff;
 	x->replay_maxage = orig->replay_maxage;
@@ -1619,13 +1628,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
 struct xfrm_state *
 xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
-	      u8 proto, const xfrm_address_t *daddr,
+	      u32 if_id, u8 proto, const xfrm_address_t *daddr,
 	      const xfrm_address_t *saddr, int create, unsigned short family)
 {
 	struct xfrm_state *x;
 
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
-	x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create);
+	x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
 	return x;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 9602cc9e05ab..79245e1c3487 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -594,6 +594,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 
 	xfrm_smark_init(attrs, &x->props.smark);
 
+	if (attrs[XFRMA_IF_ID])
+		x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
 	if (err)
 		goto error;
@@ -929,7 +932,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 		ret = copy_user_offload(&x->xso, skb);
 	if (ret)
 		goto out;
-
+	if (x->if_id) {
+		ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
+		if (ret)
+			goto out;
+	}
 	if (x->security)
 		ret = copy_sec_ctx(x->security, skb);
 out:
@@ -1278,6 +1285,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err;
 	u32 mark;
 	struct xfrm_mark m;
+	u32 if_id = 0;
 
 	p = nlmsg_data(nlh);
 	err = verify_spi_info(p->info.id.proto, p->min, p->max);
@@ -1290,6 +1298,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	x = NULL;
 
 	mark = xfrm_mark_get(attrs, &m);
+
+	if (attrs[XFRMA_IF_ID])
+		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	if (p->info.seq) {
 		x = xfrm_find_acq_byseq(net, mark, p->info.seq);
 		if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
@@ -1300,7 +1312,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (!x)
 		x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
-				  p->info.id.proto, daddr,
+				  if_id, p->info.id.proto, daddr,
 				  &p->info.saddr, 1,
 				  family);
 	err = -ENOENT;
@@ -1588,6 +1600,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
 
 	xfrm_mark_get(attrs, &xp->mark);
 
+	if (attrs[XFRMA_IF_ID])
+		xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	return xp;
  error:
 	*errp = err;
@@ -1733,6 +1748,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 		err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -1814,6 +1831,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int delete;
 	struct xfrm_mark m;
 	u32 mark = xfrm_mark_get(attrs, &m);
+	u32 if_id = 0;
 
 	p = nlmsg_data(nlh);
 	delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
@@ -1826,8 +1844,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
+	if (attrs[XFRMA_IF_ID])
+		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	if (p->index)
-		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+		xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err);
 	else {
 		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 		struct xfrm_sec_ctx *ctx;
@@ -1844,7 +1865,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (err)
 				return err;
 		}
-		xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+		xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel,
 					   ctx, delete, &err);
 		security_xfrm_policy_free(ctx);
 	}
@@ -1967,6 +1988,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		goto out_cancel;
 
+	err = xfrm_if_id_put(skb, x->if_id);
+	if (err)
+		goto out_cancel;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -2109,6 +2134,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -ENOENT;
 	struct xfrm_mark m;
 	u32 mark = xfrm_mark_get(attrs, &m);
+	u32 if_id = 0;
 
 	err = copy_from_user_policy_type(&type, attrs);
 	if (err)
@@ -2118,8 +2144,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
+	if (attrs[XFRMA_IF_ID])
+		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
 	if (p->index)
-		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+		xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err);
 	else {
 		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 		struct xfrm_sec_ctx *ctx;
@@ -2136,7 +2165,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (err)
 				return err;
 		}
-		xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+		xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir,
 					   &p->sel, ctx, 0, &err);
 		security_xfrm_policy_free(ctx);
 	}
@@ -2520,6 +2549,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_OFFLOAD_DEV]	= { .len = sizeof(struct xfrm_user_offload) },
 	[XFRMA_SET_MARK]	= { .type = NLA_U32 },
 	[XFRMA_SET_MARK_MASK]	= { .type = NLA_U32 },
+	[XFRMA_IF_ID]		= { .type = NLA_U32 },
 };
 
 static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2651,6 +2681,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		return err;
 
+	err = xfrm_if_id_put(skb, x->if_id);
+	if (err)
+		return err;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 }
@@ -2749,6 +2783,8 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(x->props.smark.v));
 		l += nla_total_size(sizeof(x->props.smark.m));
 	}
+	if (x->if_id)
+		l += nla_total_size(sizeof(x->if_id));
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
 	l += nla_total_size_64bit(sizeof(u64));
@@ -2878,6 +2914,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 		err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -2994,6 +3032,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
 		err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -3075,6 +3115,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e
 		err = copy_to_user_policy_type(xp->type, skb);
 	if (!err)
 		err = xfrm_mark_put(skb, &xp->mark);
+	if (!err)
+		err = xfrm_if_id_put(skb, xp->if_id);
 	if (err)
 		goto out_free_skb;
 
-- 
cgit v1.2.3


From f203b76d78092faf248db3f851840fbecf80b40e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 12 Jun 2018 14:07:12 +0200
Subject: xfrm: Add virtual xfrm interfaces

This patch adds support for virtual xfrm interfaces.
Packets that are routed through such an interface
are guaranteed to be IPsec transformed or dropped.
It is a generic virtual interface that ensures IPsec
transformation, no need to know what happens behind
the interface. This means that we can tunnel IPv4 and
IPv6 through the same interface and support all xfrm
modes (tunnel, transport and beet) on it.

Co-developed-by: Lorenzo Colitti <lorenzo@google.com>
Co-developed-by: Benedict Wong <benedictwong@google.com>
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Benedict Wong <benedictwong@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Benedict Wong <benedictwong@google.com>
Tested-by: Antony Antony <antony@phenome.org>
Reviewed-by: Eyal Birger <eyal.birger@gmail.com>
---
 include/net/xfrm.h           |  24 ++
 include/uapi/linux/if_link.h |  10 +
 net/xfrm/Kconfig             |   8 +
 net/xfrm/Makefile            |   1 +
 net/xfrm/xfrm_input.c        |   3 +
 net/xfrm/xfrm_interface.c    | 972 +++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_policy.c       |  43 ++
 7 files changed, 1061 insertions(+)
 create mode 100644 net/xfrm/xfrm_interface.c

(limited to 'include/uapi/linux')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e8bada4d2a45..3fa578a6a819 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -23,6 +23,7 @@
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
 #include <net/flow.h>
+#include <net/gro_cells.h>
 
 #include <linux/interrupt.h>
 
@@ -293,6 +294,13 @@ struct xfrm_replay {
 	int	(*overflow)(struct xfrm_state *x, struct sk_buff *skb);
 };
 
+struct xfrm_if_cb {	/* callbacks installed with xfrm_if_register_cb() */
+	struct xfrm_if	*(*decode_session)(struct sk_buff *skb); /* xfrm_if for skb, or NULL */
+};
+
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb);
+void xfrm_if_unregister_cb(void);
+
 struct net_device;
 struct xfrm_type;
 struct xfrm_dst;
@@ -1039,6 +1047,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
 
+struct xfrm_if_parms {
+	char name[IFNAMSIZ];	/* name of XFRM device */
+	int link;		/* ifindex of underlying L2 interface */
+	u32 if_id;		/* interface identifier */
+};
+
+struct xfrm_if {
+	struct xfrm_if __rcu *next;	/* next interface in list */
+	struct net_device *dev;		/* virtual device associated with interface */
+	struct net_device *phydev;	/* physical device */
+	struct net *net;		/* netns for packet i/o */
+	struct xfrm_if_parms p;		/* interface parms */
+
+	struct gro_cells gro_cells;	/* set up by gro_cells_init() in the driver's ndo_init */
+};
+
 struct xfrm_offload {
 	/* Output sequence number for replay protection on offloading. */
 	struct {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index cf01b6824244..bff0af507b32 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -459,6 +459,16 @@ enum {
 
 #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
 
+/* XFRM section */
+enum {
+	IFLA_XFRM_UNSPEC,
+	IFLA_XFRM_LINK,		/* u32: ifindex of the underlying interface */
+	IFLA_XFRM_IF_ID,	/* u32: xfrm interface id */
+	__IFLA_XFRM_MAX
+};
+
+#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1)
+
 enum macsec_validation_type {
 	MACSEC_VALIDATE_DISABLED = 0,
 	MACSEC_VALIDATE_CHECK = 1,
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 286ed25c1a69..53381888a7b3 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -25,6 +25,14 @@ config XFRM_USER
 
 	  If unsure, say Y.
 
+config XFRM_INTERFACE
+	tristate "Transformation virtual interface"
+	depends on XFRM && IPV6
+	---help---
+	  This provides a virtual interface to route IPsec traffic.
+
+	  If unsure, say N.
+
 config XFRM_SUB_POLICY
 	bool "Transformation sub policy support"
 	depends on XFRM
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 0bd2465a8c5a..fbc4552d17b8 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
 obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
 obj-$(CONFIG_XFRM_USER) += xfrm_user.o
 obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
+obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 074810436242..b89c9c7f8c5c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 	seq = 0;
 	if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+		secpath_reset(skb);
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
 		goto drop;
 	}
@@ -328,12 +329,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 				   XFRM_SPI_SKB_CB(skb)->daddroff);
 	do {
 		if (skb->sp->len == XFRM_MAX_DEPTH) {
+			secpath_reset(skb);
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
 			goto drop;
 		}
 
 		x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
 		if (x == NULL) {
+			secpath_reset(skb);
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
 			xfrm_audit_state_notfound(skb, family, spi, seq);
 			goto drop;
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
new file mode 100644
index 000000000000..31cb1c7e3881
--- /dev/null
+++ b/net/xfrm/xfrm_interface.c
@@ -0,0 +1,972 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *	XFRM virtual interface
+ *
+ *	Copyright (C) 2018 secunet Security Networks AG
+ *
+ *	Author:
+ *	Steffen Klassert <steffen.klassert@secunet.com>
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_link.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/etherdevice.h>
+
+static int xfrmi_dev_init(struct net_device *dev);
+static void xfrmi_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
+static unsigned int xfrmi_net_id __read_mostly;
+
+struct xfrmi_net {
+	/* lists for storing interfaces in use */
+	struct xfrm_if __rcu *xfrmi[1];
+};
+
+#define for_each_xfrmi_rcu(start, xi) \
+	for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))
+
+/* Return the first interface on the list of @net that is up and whose
+ * if_id equals the if_id of state @x, or NULL if there is none.
+ * The list is walked with rcu_dereference().
+ */
+static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
+{
+	struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+	struct xfrm_if *cur;
+
+	for_each_xfrmi_rcu(xfrmn->xfrmi[0], cur) {
+		if (cur->p.if_id != x->if_id)
+			continue;
+		if (cur->dev->flags & IFF_UP)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Resolve the xfrm interface @skb is attached to: the first interface in
+ * the netns of skb->dev that is up and has the same ifindex as skb->dev.
+ * Returns NULL when @skb has no device or nothing matches.
+ */
+static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb)
+{
+	struct net_device *skb_dev = skb->dev;
+	struct xfrmi_net *xfrmn;
+	struct xfrm_if *cur;
+
+	if (!skb_dev)
+		return NULL;
+
+	xfrmn = net_generic(dev_net(skb_dev), xfrmi_net_id);
+
+	for_each_xfrmi_rcu(xfrmn->xfrmi[0], cur) {
+		if (cur->dev->ifindex != skb_dev->ifindex)
+			continue;
+		if (cur->dev->flags & IFF_UP)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Insert @xi at the head of the per-netns interface list.  The current head
+ * is read with rtnl_dereference(), and @xi->next is set before @xi itself
+ * is published as the new head.
+ */
+static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+	struct xfrm_if *old_head = rtnl_dereference(xfrmn->xfrmi[0]);
+
+	rcu_assign_pointer(xi->next, old_head);
+	rcu_assign_pointer(xfrmn->xfrmi[0], xi);
+}
+
+static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+	struct xfrm_if __rcu **xip;
+	struct xfrm_if *iter;
+
+	for (xip = &xfrmn->xfrmi[0];
+	     (iter = rtnl_dereference(*xip)) != NULL;
+	     xip = &iter->next) {
+		if (xi == iter) {
+			rcu_assign_pointer(*xip, xi->next);
+			break;
+		}
+	}
+}
+
+/* priv_destructor of the xfrm interface netdev: release everything that
+ * xfrmi_dev_init() allocated, i.e. the GRO cells and the per-cpu stats.
+ * Without the gro_cells_destroy() call the cells set up by
+ * gro_cells_init() would be leaked on every device teardown.
+ */
+static void xfrmi_dev_free(struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+
+	gro_cells_destroy(&xi->gro_cells);
+	free_percpu(dev->tstats);
+}
+
+static int xfrmi_create2(struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+	struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+	int err;
+
+	dev->rtnl_link_ops = &xfrmi_link_ops;
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto out;
+
+	strcpy(xi->p.name, dev->name);
+
+	dev_hold(dev);
+	xfrmi_link(xfrmn, xi);
+
+	return 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
+{
+	struct net_device *dev;
+	struct xfrm_if *xi;
+	char name[IFNAMSIZ];
+	int err;
+
+	if (p->name[0])
+		strlcpy(name, p->name, IFNAMSIZ);
+	else
+		goto failed;
+
+	dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup);
+	if (!dev)
+		goto failed;
+
+	dev_net_set(dev, net);
+
+	xi = netdev_priv(dev);
+	xi->p = *p;
+	xi->net = net;
+	xi->dev = dev;
+	xi->phydev = dev_get_by_index(net, p->link);
+	if (!xi->phydev)
+		goto failed_free;
+
+	err = xfrmi_create2(dev);
+	if (err < 0)
+		goto failed_dev_put;
+
+	return xi;
+
+failed_dev_put:
+	dev_put(xi->phydev);
+failed_free:
+	free_netdev(dev);
+failed:
+	return NULL;
+}
+
+static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
+				   int create)
+{
+	struct xfrm_if __rcu **xip;
+	struct xfrm_if *xi;
+	struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+	for (xip = &xfrmn->xfrmi[0];
+	     (xi = rtnl_dereference(*xip)) != NULL;
+	     xip = &xi->next) {
+		if (xi->p.if_id == p->if_id) {
+			if (create)
+				return NULL;
+
+			return xi;
+		}
+	}
+	if (!create)
+		return NULL;
+	return xfrmi_create(net, p);
+}
+
+static void xfrmi_dev_uninit(struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
+
+	xfrmi_unlink(xfrmn, xi);
+	dev_put(xi->phydev);
+	dev_put(dev);
+}
+
+static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+	skb->tstamp = 0;
+	skb->pkt_type = PACKET_HOST;
+	skb->skb_iif = 0;
+	skb->ignore_df = 0;
+	skb_dst_drop(skb);
+	nf_reset(skb);
+	nf_reset_trace(skb);
+
+	if (!xnet)
+		return;
+
+	ipvs_reset(skb);
+	secpath_reset(skb);
+	skb_orphan(skb);
+	skb->mark = 0;
+}
+
+static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
+{
+	struct pcpu_sw_netstats *tstats;
+	struct xfrm_mode *inner_mode;
+	struct net_device *dev;
+	struct xfrm_state *x;
+	struct xfrm_if *xi;
+	bool xnet;
+
+	if (err && !skb->sp)
+		return 0;
+
+	x = xfrm_input_state(skb);
+
+	xi = xfrmi_lookup(xs_net(x), x);
+	if (!xi)
+		return 1;
+
+	dev = xi->dev;
+	skb->dev = dev;
+
+	if (err) {
+		dev->stats.rx_errors++;
+		dev->stats.rx_dropped++;
+
+		return 0;
+	}
+
+	xnet = !net_eq(xi->net, dev_net(skb->dev));
+
+	if (xnet) {
+		inner_mode = x->inner_mode;
+
+		if (x->sel.family == AF_UNSPEC) {
+			inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+			if (inner_mode == NULL) {
+				XFRM_INC_STATS(dev_net(skb->dev),
+					       LINUX_MIB_XFRMINSTATEMODEERROR);
+				return -EINVAL;
+			}
+		}
+
+		if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
+				       inner_mode->afinfo->family))
+			return -EPERM;
+	}
+
+	xfrmi_scrub_packet(skb, xnet);
+
+	tstats = this_cpu_ptr(dev->tstats);
+
+	u64_stats_update_begin(&tstats->syncp);
+	tstats->rx_packets++;
+	tstats->rx_bytes += skb->len;
+	u64_stats_update_end(&tstats->syncp);
+
+	return 0;
+}
+
+static int
+xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net_device_stats *stats = &xi->dev->stats;
+	struct dst_entry *dst = skb_dst(skb);
+	unsigned int length = skb->len;
+	struct net_device *tdev;
+	struct xfrm_state *x;
+	int err = -1;
+	int mtu;
+
+	if (!dst)
+		goto tx_err_link_failure;
+
+	fl->flowi_xfrm.if_id = xi->p.if_id;
+
+	dst_hold(dst);
+	dst = xfrm_lookup(xi->net, dst, fl, NULL, 0);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		dst = NULL;
+		goto tx_err_link_failure;
+	}
+
+	x = dst->xfrm;
+	if (!x)
+		goto tx_err_link_failure;
+
+	if (x->if_id != xi->p.if_id)
+		goto tx_err_link_failure;
+
+	tdev = dst->dev;
+
+	if (tdev == dev) {
+		stats->collisions++;
+		net_warn_ratelimited("%s: Local routing loop detected!\n",
+				     xi->p.name);
+		goto tx_err_dst_release;
+	}
+
+	mtu = dst_mtu(dst);
+	if (!skb->ignore_df && skb->len > mtu) {
+		skb_dst_update_pmtu(skb, mtu);
+
+		if (skb->protocol == htons(ETH_P_IPV6)) {
+			if (mtu < IPV6_MIN_MTU)
+				mtu = IPV6_MIN_MTU;
+
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		} else {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+		}
+
+		dst_release(dst);
+		return -EMSGSIZE;
+	}
+
+	xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
+	skb->dev = tdev;
+
+	err = dst_output(xi->net, skb->sk, skb);
+	if (net_xmit_eval(err) == 0) {
+		struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+		u64_stats_update_begin(&tstats->syncp);
+		tstats->tx_bytes += length;
+		tstats->tx_packets++;
+		u64_stats_update_end(&tstats->syncp);
+	} else {
+		stats->tx_errors++;
+		stats->tx_aborted_errors++;
+	}
+
+	return 0;
+tx_err_link_failure:
+	stats->tx_carrier_errors++;
+	dst_link_failure(skb);
+tx_err_dst_release:
+	dst_release(dst);
+	return err;
+}
+
+static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net_device_stats *stats = &xi->dev->stats;
+	struct flowi fl;
+	int ret;
+
+	memset(&fl, 0, sizeof(fl));
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IPV6):
+		xfrm_decode_session(skb, &fl, AF_INET6);
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+		break;
+	case htons(ETH_P_IP):
+		xfrm_decode_session(skb, &fl, AF_INET);
+		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+		break;
+	default:
+		goto tx_err;
+	}
+
+	fl.flowi_oif = xi->phydev->ifindex;
+
+	ret = xfrmi_xmit2(skb, dev, &fl);
+	if (ret < 0)
+		goto tx_err;
+
+	return NETDEV_TX_OK;
+
+tx_err:
+	stats->tx_errors++;
+	stats->tx_dropped++;
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int xfrmi4_err(struct sk_buff *skb, u32 info)
+{	/* IPv4 err_handler of the xfrmi ESP/AH/IPcomp protocol entries */
+	const struct iphdr *iph = (const struct iphdr *)skb->data; /* skb->data is read as the IPv4 header */
+	struct net *net = dev_net(skb->dev);
+	int protocol = iph->protocol;
+	struct ip_comp_hdr *ipch;
+	struct ip_esp_hdr *esph;
+	struct ip_auth_hdr *ah ;
+	struct xfrm_state *x;
+	struct xfrm_if *xi;
+	__be32 spi;
+
+	switch (protocol) {	/* extract the SPI from the header that follows the IP header */
+	case IPPROTO_ESP:
+		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = esph->spi;
+		break;
+	case IPPROTO_AH:
+		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+		spi = ah->spi;
+		break;
+	case IPPROTO_COMP:
+		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = htonl(ntohs(ipch->cpi));	/* 16-bit CPI widened to a 32-bit SPI */
+		break;
+	default:
+		return 0;	/* not ESP, AH or IPcomp: nothing to do */
+	}
+
+	switch (icmp_hdr(skb)->type) {	/* only FRAG_NEEDED and REDIRECT are acted upon */
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	case ICMP_REDIRECT:	/* ICMP_FRAG_NEEDED deliberately falls through to here */
+		break;
+	default:
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET);
+	if (!x)
+		return 0;
+
+	xi = xfrmi_lookup(net, x);	/* is the state bound to an up xfrm interface? */
+	if (!xi) {
+		xfrm_state_put(x);
+		return -1;	/* no matching interface for this state */
+	}
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+	xfrm_state_put(x);	/* done with the state on the success path as well */
+
+	return 0;
+}
+
+static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		    u8 type, u8 code, int offset, __be32 info)
+{
+	const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+	struct net *net = dev_net(skb->dev);
+	int protocol = iph->nexthdr;
+	struct ip_comp_hdr *ipch;
+	struct ip_esp_hdr *esph;
+	struct ip_auth_hdr *ah;
+	struct xfrm_state *x;
+	struct xfrm_if *xi;
+	__be32 spi;
+
+	switch (protocol) {
+	case IPPROTO_ESP:
+		esph = (struct ip_esp_hdr *)(skb->data + offset);
+		spi = esph->spi;
+		break;
+	case IPPROTO_AH:
+		ah = (struct ip_auth_hdr *)(skb->data + offset);
+		spi = ah->spi;
+		break;
+	case IPPROTO_COMP:
+		ipch = (struct ip_comp_hdr *)(skb->data + offset);
+		spi = htonl(ntohs(ipch->cpi));
+		break;
+	default:
+		return 0;
+	}
+
+	if (type != ICMPV6_PKT_TOOBIG &&
+	    type != NDISC_REDIRECT)
+		return 0;
+
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET6);
+	if (!x)
+		return 0;
+
+	xi = xfrmi_lookup(net, x);
+	if (!xi) {
+		xfrm_state_put(x);
+		return -1;
+	}
+
+	if (type == NDISC_REDIRECT)
+		ip6_redirect(skb, net, skb->dev->ifindex, 0,
+			     sock_net_uid(net, NULL));
+	else
+		ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+	xfrm_state_put(x);
+
+	return 0;
+}
+
+static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
+{
+	if (xi->p.link != p->link)
+		return -EINVAL;
+
+	xi->p.if_id = p->if_id;
+
+	return 0;
+}
+
+static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
+{
+	struct net *net = dev_net(xi->dev);
+	struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+	int err;
+
+	xfrmi_unlink(xfrmn, xi);
+	synchronize_net();
+	err = xfrmi_change(xi, p);
+	xfrmi_link(xfrmn, xi);
+	netdev_state_change(xi->dev);
+	return err;
+}
+
+/* ndo_get_stats64: fill @s with the statistics of @dev.
+ *
+ * Packet and byte counters are summed over the per-cpu dev->tstats.
+ * The drop and error counters are copied from dev->stats, where the rx and
+ * tx paths of this driver count them (rx_errors, rx_dropped, tx_errors,
+ * tx_dropped, tx_aborted_errors, tx_carrier_errors, collisions).
+ * Nothing is written when the per-cpu stats have not been allocated.
+ */
+static void xfrmi_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *s)
+{
+	int cpu;
+
+	if (!dev->tstats)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct pcpu_sw_netstats *stats;
+		struct pcpu_sw_netstats tmp;
+		unsigned int start;
+
+		stats = per_cpu_ptr(dev->tstats, cpu);
+		/* re-read until the snapshot of this cpu is consistent */
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->syncp);
+			tmp.rx_packets = stats->rx_packets;
+			tmp.rx_bytes   = stats->rx_bytes;
+			tmp.tx_packets = stats->tx_packets;
+			tmp.tx_bytes   = stats->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+		s->rx_packets += tmp.rx_packets;
+		s->rx_bytes   += tmp.rx_bytes;
+		s->tx_packets += tmp.tx_packets;
+		s->tx_bytes   += tmp.tx_bytes;
+	}
+
+	s->rx_dropped = dev->stats.rx_dropped;
+	s->tx_dropped = dev->stats.tx_dropped;
+
+	/* These are incremented by xfrmi_rcv_cb(), xfrmi_xmit() and
+	 * xfrmi_xmit2() but were never copied out, so they always read as
+	 * zero through this callback.
+	 */
+	s->rx_errors = dev->stats.rx_errors;
+	s->tx_errors = dev->stats.tx_errors;
+	s->tx_aborted_errors = dev->stats.tx_aborted_errors;
+	s->tx_carrier_errors = dev->stats.tx_carrier_errors;
+	s->collisions = dev->stats.collisions;
+}
+
+static int xfrmi_get_iflink(const struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+
+	return xi->phydev->ifindex;
+}
+
+
+static const struct net_device_ops xfrmi_netdev_ops = {
+	.ndo_init	= xfrmi_dev_init,
+	.ndo_uninit	= xfrmi_dev_uninit,
+	.ndo_start_xmit = xfrmi_xmit,
+	.ndo_get_stats64 = xfrmi_get_stats64,
+	.ndo_get_iflink = xfrmi_get_iflink,
+};
+
+static void xfrmi_dev_setup(struct net_device *dev)
+{
+	dev->netdev_ops 	= &xfrmi_netdev_ops;
+	dev->type		= ARPHRD_NONE;
+	dev->hard_header_len 	= ETH_HLEN;
+	dev->min_header_len	= ETH_HLEN;
+	dev->mtu		= ETH_DATA_LEN;
+	dev->min_mtu		= ETH_MIN_MTU;
+	dev->max_mtu		= ETH_DATA_LEN;
+	dev->addr_len		= ETH_ALEN;
+	dev->flags 		= IFF_NOARP;
+	dev->needs_free_netdev	= true;
+	dev->priv_destructor	= xfrmi_dev_free;
+	netif_keep_dst(dev);
+}
+
+static int xfrmi_dev_init(struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net_device *phydev = xi->phydev;
+	int err;
+
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	err = gro_cells_init(&xi->gro_cells, dev);
+	if (err) {
+		free_percpu(dev->tstats);
+		return err;
+	}
+
+	dev->features |= NETIF_F_LLTX;
+
+	dev->needed_headroom = phydev->needed_headroom;
+	dev->needed_tailroom = phydev->needed_tailroom;
+
+	if (is_zero_ether_addr(dev->dev_addr))
+		eth_hw_addr_inherit(dev, phydev);
+	if (is_zero_ether_addr(dev->broadcast))
+		memcpy(dev->broadcast, phydev->broadcast, dev->addr_len);
+
+	return 0;
+}
+
+static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[],
+			 struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static void xfrmi_netlink_parms(struct nlattr *data[],
+			       struct xfrm_if_parms *parms)
+{
+	memset(parms, 0, sizeof(*parms));
+
+	if (!data)
+		return;
+
+	if (data[IFLA_XFRM_LINK])
+		parms->link = nla_get_u32(data[IFLA_XFRM_LINK]);
+
+	if (data[IFLA_XFRM_IF_ID])
+		parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]);
+}
+
+/* rtnl newlink handler: initialise and register @dev as an xfrm interface.
+ *
+ * @dev is the netdev handed in by rtnetlink; its private area is the
+ * struct xfrm_if (see .priv_size in xfrmi_link_ops).  It must be this
+ * device that gets registered.  Do not go through xfrmi_locate(..., 1):
+ * that path allocates and registers a second netdev in xfrmi_create() and
+ * leaves @dev unregistered.
+ *
+ * Returns 0 on success, -EINVAL if IFLA_IFNAME is missing, -EEXIST if an
+ * interface with the same if_id already exists in the netns, -ENODEV if
+ * the link ifindex does not resolve to a device, or the error returned by
+ * xfrmi_create2().
+ */
+static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
+			struct nlattr *tb[], struct nlattr *data[],
+			struct netlink_ext_ack *extack)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+	struct xfrm_if_parms p;
+	int err;
+
+	xfrmi_netlink_parms(data, &p);
+
+	if (!tb[IFLA_IFNAME])
+		return -EINVAL;
+
+	nla_strlcpy(p.name, tb[IFLA_IFNAME], IFNAMSIZ);
+
+	/* lookup only (create == 0): the if_id must not be in use yet */
+	if (xfrmi_locate(net, &p, 0))
+		return -EEXIST;
+
+	xi->p = p;
+	xi->net = net;
+	xi->dev = dev;
+	xi->phydev = dev_get_by_index(net, p.link);
+	if (!xi->phydev)
+		return -ENODEV;
+
+	err = xfrmi_create2(dev);
+	if (err < 0)
+		dev_put(xi->phydev);	/* undo dev_get_by_index() above */
+
+	return err;
+}
+
+static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
+{
+	unregister_netdevice_queue(dev, head);
+}
+
+/* rtnl changelink handler.
+ *
+ * The request is parsed into a local struct xfrm_if_parms.  Parsing straight
+ * into xi->p would zero the stored name (xfrmi_netlink_parms() memsets its
+ * target), make the link comparison in xfrmi_change() compare xi->p with
+ * itself, and leave the interface modified even when -EEXIST is returned.
+ * Attributes that are absent from the request keep their current value.
+ *
+ * Returns -EEXIST if another interface already uses the requested if_id,
+ * otherwise the result of xfrmi_update().
+ */
+static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
+			   struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+	struct xfrm_if_parms p;
+
+	xfrmi_netlink_parms(data, &p);
+
+	if (!data || !data[IFLA_XFRM_LINK])
+		p.link = xi->p.link;
+	if (!data || !data[IFLA_XFRM_IF_ID])
+		p.if_id = xi->p.if_id;
+
+	xi = xfrmi_locate(net, &p, 0);
+	if (!xi)
+		xi = netdev_priv(dev);
+	else if (xi->dev != dev)
+		return -EEXIST;
+
+	return xfrmi_update(xi, &p);
+}
+
+static size_t xfrmi_get_size(const struct net_device *dev)
+{
+	return
+		/* IFLA_XFRM_LINK */
+		nla_total_size(4) +
+		/* IFLA_XFRM_IF_ID */
+		nla_total_size(4) +
+		0;
+}
+
+/* fill_info hook: emit IFLA_XFRM_LINK and IFLA_XFRM_IF_ID from the stored
+ * interface parameters.  Returns 0, or -EMSGSIZE if either nla_put_u32()
+ * fails.
+ */
+static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_XFRM_LINK, xi->p.link))
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_XFRM_IF_ID, xi->p.if_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* get_link_net hook: return the netns of the underlying device (phydev).
+ * Only referenced through xfrmi_link_ops in this file, hence static.
+ */
+static struct net *xfrmi_get_link_net(const struct net_device *dev)
+{
+	struct xfrm_if *xi = netdev_priv(dev);
+
+	return dev_net(xi->phydev);
+}
+
+static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
+	[IFLA_XFRM_LINK]	= { .type = NLA_U32 },
+	[IFLA_XFRM_IF_ID]	= { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
+	.kind		= "xfrm",
+	.maxtype	= IFLA_XFRM_MAX,
+	.policy		= xfrmi_policy,
+	.priv_size	= sizeof(struct xfrm_if),
+	.setup		= xfrmi_dev_setup,
+	.validate	= xfrmi_validate,
+	.newlink	= xfrmi_newlink,
+	.dellink	= xfrmi_dellink,
+	.changelink	= xfrmi_changelink,
+	.get_size	= xfrmi_get_size,
+	.fill_info	= xfrmi_fill_info,
+	.get_link_net	= xfrmi_get_link_net,
+};
+
+/* Unregister every xfrm interface on the list of @xfrmn (netns exit path,
+ * called with the rtnl lock held by xfrmi_exit_net()).
+ *
+ * All interfaces are queued first and then torn down in one
+ * unregister_netdevice_many() batch.  Queueing does not touch the
+ * interface list (entries are unlinked in xfrmi_dev_uninit()), so walking
+ * xi->next while queueing is safe.  Handling only the list head would
+ * leave every further interface of the netns registered.
+ */
+static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
+{
+	struct xfrm_if *xi;
+	LIST_HEAD(list);
+
+	for (xi = rtnl_dereference(xfrmn->xfrmi[0]); xi;
+	     xi = rtnl_dereference(xi->next))
+		unregister_netdevice_queue(xi->dev, &list);
+
+	unregister_netdevice_many(&list);
+}
+
+static int __net_init xfrmi_init_net(struct net *net)
+{
+	return 0;
+}
+
+static void __net_exit xfrmi_exit_net(struct net *net)
+{
+	struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+	rtnl_lock();
+	xfrmi_destroy_interfaces(xfrmn);
+	rtnl_unlock();
+}
+
+static struct pernet_operations xfrmi_net_ops = {
+	.init = xfrmi_init_net,
+	.exit = xfrmi_exit_net,
+	.id   = &xfrmi_net_id,
+	.size = sizeof(struct xfrmi_net),
+};
+
+static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
+	.handler	=	xfrm6_rcv,
+	.cb_handler	=	xfrmi_rcv_cb,
+	.err_handler	=	xfrmi6_err,
+	.priority	=	10,
+};
+
+static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = {
+	.handler	=	xfrm6_rcv,
+	.cb_handler	=	xfrmi_rcv_cb,
+	.err_handler	=	xfrmi6_err,
+	.priority	=	10,
+};
+
+static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = {
+	.handler	=	xfrm6_rcv,
+	.cb_handler	=	xfrmi_rcv_cb,
+	.err_handler	=	xfrmi6_err,
+	.priority	=	10,
+};
+
+static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
+	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	xfrmi_rcv_cb,
+	.err_handler	=	xfrmi4_err,
+	.priority	=	10,
+};
+
+static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = {
+	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	xfrmi_rcv_cb,
+	.err_handler	=	xfrmi4_err,
+	.priority	=	10,
+};
+
+static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = {
+	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	xfrmi_rcv_cb,
+	.err_handler	=	xfrmi4_err,
+	.priority	=	10,
+};
+
+static int __init xfrmi4_init(void)
+{
+	int err;
+
+	err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP);
+	if (err < 0)
+		goto xfrm_proto_esp_failed;
+	err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH);
+	if (err < 0)
+		goto xfrm_proto_ah_failed;
+	err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+	if (err < 0)
+		goto xfrm_proto_comp_failed;
+
+	return 0;
+
+xfrm_proto_comp_failed:
+	xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+	xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+	return err;
+}
+
+static void xfrmi4_fini(void)
+{
+	xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+	xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+	xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+}
+
+static int __init xfrmi6_init(void)
+{
+	int err;
+
+	err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP);
+	if (err < 0)
+		goto xfrm_proto_esp_failed;
+	err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH);
+	if (err < 0)
+		goto xfrm_proto_ah_failed;
+	err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+	if (err < 0)
+		goto xfrm_proto_comp_failed;
+
+	return 0;
+
+xfrm_proto_comp_failed:
+	xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+	xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+	return err;
+}
+
+static void xfrmi6_fini(void)
+{
+	xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+	xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+	xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+}
+
+static const struct xfrm_if_cb xfrm_if_cb = {
+	.decode_session =	xfrmi_decode_session,
+};
+
+static int __init xfrmi_init(void)
+{
+	const char *msg;
+	int err;
+
+	pr_info("IPsec XFRM device driver\n");
+
+	msg = "tunnel device";
+	err = register_pernet_device(&xfrmi_net_ops);
+	if (err < 0)
+		goto pernet_dev_failed;
+
+	msg = "xfrm4 protocols";
+	err = xfrmi4_init();
+	if (err < 0)
+		goto xfrmi4_failed;
+
+	msg = "xfrm6 protocols";
+	err = xfrmi6_init();
+	if (err < 0)
+		goto xfrmi6_failed;
+
+
+	msg = "netlink interface";
+	err = rtnl_link_register(&xfrmi_link_ops);
+	if (err < 0)
+		goto rtnl_link_failed;
+
+	xfrm_if_register_cb(&xfrm_if_cb);
+
+	return err;
+
+rtnl_link_failed:
+	xfrmi6_fini();
+xfrmi6_failed:
+	xfrmi4_fini();
+xfrmi4_failed:
+	unregister_pernet_device(&xfrmi_net_ops);
+pernet_dev_failed:
+	pr_err("xfrmi init: failed to register %s\n", msg);
+	return err;
+}
+
+static void __exit xfrmi_fini(void)
+{
+	xfrm_if_unregister_cb();
+	rtnl_link_unregister(&xfrmi_link_ops);
+	xfrmi4_fini();
+	xfrmi6_fini();
+	unregister_pernet_device(&xfrmi_net_ops);
+}
+
+module_init(xfrmi_init);
+module_exit(xfrmi_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("xfrm");
+MODULE_ALIAS_NETDEV("xfrm0");
+MODULE_AUTHOR("Steffen Klassert");
+MODULE_DESCRIPTION("XFRM virtual interface");
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index fc0c69312b2c..d960ea6657b5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -47,6 +47,9 @@ struct xfrm_flo {
 
 static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
 static struct work_struct *xfrm_pcpu_work __read_mostly;
+static DEFINE_SPINLOCK(xfrm_if_cb_lock);
+static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
+
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
@@ -119,6 +122,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa
 	return afinfo;
 }
 
+/* Called with rcu_read_lock(). */
+static const struct xfrm_if_cb *xfrm_if_get_cb(void)
+{
+	return rcu_dereference(xfrm_if_cb);
+}
+
 struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
 				    const xfrm_address_t *saddr,
 				    const xfrm_address_t *daddr,
@@ -2083,6 +2092,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
 
 	if (IS_ERR(xdst)) {
 		err = PTR_ERR(xdst);
+		if (err == -EREMOTE) {
+			xfrm_pols_put(pols, num_pols);
+			return NULL;
+		}
+
 		if (err != -EAGAIN)
 			goto error;
 		goto make_dummy_bundle;
@@ -2176,6 +2190,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 			if (IS_ERR(xdst)) {
 				xfrm_pols_put(pols, num_pols);
 				err = PTR_ERR(xdst);
+				if (err == -EREMOTE)
+					goto nopol;
+
 				goto dropdst;
 			} else if (xdst == NULL) {
 				num_xfrms = 0;
@@ -2368,12 +2385,20 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 			  unsigned int family, int reverse)
 {
 	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	const struct xfrm_if_cb *ifcb = xfrm_if_get_cb();
+	struct xfrm_if *xi;
 	int err;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 
 	afinfo->decode_session(skb, fl, reverse);
+	if (ifcb) {
+		xi = ifcb->decode_session(skb);
+		if (xi)
+			fl->flowi_xfrm.if_id = xi->p.if_id;
+	}
+
 	err = security_xfrm_decode_session(skb, &fl->flowi_secid);
 	rcu_read_unlock();
 	return err;
@@ -2828,6 +2853,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
 
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
+{
+	spin_lock(&xfrm_if_cb_lock);
+	rcu_assign_pointer(xfrm_if_cb, ifcb);
+	spin_unlock(&xfrm_if_cb_lock);
+}
+EXPORT_SYMBOL(xfrm_if_register_cb);
+
+void xfrm_if_unregister_cb(void)
+{
+	RCU_INIT_POINTER(xfrm_if_cb, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_if_unregister_cb);
+
 #ifdef CONFIG_XFRM_STATISTICS
 static int __net_init xfrm_statistics_init(struct net *net)
 {
@@ -3008,6 +3048,9 @@ void __init xfrm_init(void)
 	xfrm_dev_init();
 	seqcount_init(&xfrm_policy_hash_generation);
 	xfrm_input_init();
+
+	RCU_INIT_POINTER(xfrm_if_cb, NULL);
+	synchronize_rcu();
 }
 
 #ifdef CONFIG_AUDITSYSCALL
-- 
cgit v1.2.3


From d0dd63a8aee1ef89f2e48e554b796b9f9e4fcadb Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 16 Jun 2018 22:11:42 -0700
Subject: time: Introduce struct __kernel_itimerspec

struct itimerspec is not y2038-safe.

Introduce a new struct __kernel_itimerspec based on the kernel internal
y2038-safe struct itimerspec64.

The definition of struct __kernel_itimerspec includes two struct
__kernel_timespec.

Since struct __kernel_timespec has the same representation in native and
compat modes, so does struct __kernel_itimerspec. This helps have a common
entry point for syscalls using struct __kernel_itimerspec.

New y2038-safe syscalls will use this new type. Since most of the new
syscalls are just an update to the native syscalls with the type update,
place the new definition under CONFIG_64BIT_TIME. This helps architectures
that do not support the above config to keep using the old definition of
struct itimerspec.

Also change the get/put_itimerspec64 to use struct __kernel_itimerspec.
This will help 32 bit architectures to use the new syscalls when
architectures select CONFIG_64BIT_TIME.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: viro@zeniv.linux.org.uk
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Cc: y2038@lists.linaro.org
Link: https://lkml.kernel.org/r/20180617051144.29756-2-deepa.kernel@gmail.com
---
 include/linux/time.h      | 4 ++--
 include/linux/time64.h    | 1 +
 include/uapi/linux/time.h | 7 +++++++
 kernel/time/time.c        | 4 ++--
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index aed74463592d..27d83fd2ae61 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -14,9 +14,9 @@ int get_timespec64(struct timespec64 *ts,
 int put_timespec64(const struct timespec64 *ts,
 		struct __kernel_timespec __user *uts);
 int get_itimerspec64(struct itimerspec64 *it,
-			const struct itimerspec __user *uit);
+			const struct __kernel_itimerspec __user *uit);
 int put_itimerspec64(const struct itimerspec64 *it,
-			struct itimerspec __user *uit);
+			struct __kernel_itimerspec __user *uit);
 
 extern time64_t mktime64(const unsigned int year, const unsigned int mon,
 			const unsigned int day, const unsigned int hour,
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 0a7b2f79cec7..05634afba0db 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -12,6 +12,7 @@ typedef __u64 timeu64_t;
  */
 #ifndef CONFIG_64BIT_TIME
 #define __kernel_timespec timespec
+#define __kernel_itimerspec itimerspec
 #endif
 
 #include <uapi/linux/time.h>
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index fcf936656493..6b56a2208be7 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -49,6 +49,13 @@ struct __kernel_timespec {
 };
 #endif
 
+#ifndef __kernel_itimerspec
+struct __kernel_itimerspec {
+	struct __kernel_timespec it_interval;    /* timer period */
+	struct __kernel_timespec it_value;       /* timer expiration */
+};
+#endif
+
 /*
  * legacy timeval structure, only embedded in structures that
  * traditionally used 'timeval' to pass time intervals (not absolute
diff --git a/kernel/time/time.c b/kernel/time/time.c
index b1225db61eb2..c0195225fdce 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -927,7 +927,7 @@ int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
 EXPORT_SYMBOL_GPL(compat_put_timespec64);
 
 int get_itimerspec64(struct itimerspec64 *it,
-			const struct itimerspec __user *uit)
+			const struct __kernel_itimerspec __user *uit)
 {
 	int ret;
 
@@ -942,7 +942,7 @@ int get_itimerspec64(struct itimerspec64 *it,
 EXPORT_SYMBOL_GPL(get_itimerspec64);
 
 int put_itimerspec64(const struct itimerspec64 *it,
-			struct itimerspec __user *uit)
+			struct __kernel_itimerspec __user *uit)
 {
 	int ret;
 
-- 
cgit v1.2.3


From fb223502ec0889444965f602f57b1f45f9e9845e Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 24 Jun 2018 10:02:54 -0400
Subject: tcp: add SNMP counter for zero-window drops

It would be helpful if we could display the drops due to zero window or not
enough window space.
So a new SNMP MIB entry is added to track this behavior.
This entry is named LINUX_MIB_TCPZEROWINDOWDROP and published in
/proc/net/netstat in TcpExt line as TCPZeroWindowDrop.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 +
 net/ipv4/proc.c           | 1 +
 net/ipv4/tcp_input.c      | 8 ++++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 750d89120335..97517f36a5f9 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -279,6 +279,7 @@ enum
 	LINUX_MIB_TCPDELIVERED,			/* TCPDelivered */
 	LINUX_MIB_TCPDELIVEREDCE,		/* TCPDeliveredCE */
 	LINUX_MIB_TCPACKCOMPRESSED,		/* TCPAckCompressed */
+	LINUX_MIB_TCPZEROWINDOWDROP,		/* TCPZeroWindowDrop */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 77350c1256ce..225ef3433fe5 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -287,6 +287,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
 	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
 	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
+	SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 76ca88f63b70..9c5b3415413f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4668,8 +4668,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	 *  Out of sequence packets to the out_of_order_queue.
 	 */
 	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
-		if (tcp_receive_window(tp) == 0)
+		if (tcp_receive_window(tp) == 0) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
 			goto out_of_window;
+		}
 
 		/* Ok. In sequence. In window. */
 queue_and_out:
@@ -4735,8 +4737,10 @@ drop:
 		/* If window is closed, drop tail of packet. But after
 		 * remembering D-SACK for its head made in previous line.
 		 */
-		if (!tcp_receive_window(tp))
+		if (!tcp_receive_window(tp)) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
 			goto out_of_window;
+		}
 		goto queue_and_out;
 	}
 
-- 
cgit v1.2.3


From 4d97f7d53da7dc830dbf416a3d2a6778d267ae68 Mon Sep 17 00:00:00 2001
From: Henry Wilson <henry.wilson@acentic.com>
Date: Thu, 31 May 2018 09:43:03 +0000
Subject: inotify: Add flag IN_MASK_CREATE for inotify_add_watch()

The flag IN_MASK_CREATE is introduced as a flag for inotify_add_watch()
which prevents inotify from modifying any existing watches when invoked.
If the pathname specified in the call has a watched inode associated
with it and IN_MASK_CREATE is specified, fail with an errno of EEXIST.

Use of IN_MASK_CREATE with IN_MASK_ADD is reserved for future use and
will return EINVAL.

RATIONALE

In the current implementation, there is no way to prevent
inotify_add_watch() from modifying existing watch descriptors. Even if
the caller keeps a record of all watch descriptors collected, this is
only sufficient to detect that an existing watch descriptor may have
been modified.

The assumption that a particular path will map to the same inode over
multiple calls to inotify_add_watch() cannot be made as files can be
renamed or deleted.  It is also not possible to assume that two distinct
paths do not map to the same inode, due to hard-links or a dereferenced
symbolic link. Further uses of inotify_add_watch() to revert the change
may cause other watch descriptors to be modified or created, merely
compounding the problem. There is currently no system call such as
inotify_modify_watch() to explicitly modify a watch descriptor, which
would be able to revert unwanted changes. Thus the caller cannot
guarantee to be able to revert any changes to existing watch descriptors.

Additionally the caller cannot assume that the events that are
associated with a watch descriptor are within the set requested, as any
future calls to inotify_add_watch() may unintentionally modify a watch
descriptor's mask. Thus it cannot currently be guaranteed that a watch
descriptor will only generate events which have been requested. The
program must filter events which come through its watch descriptor to
within its expected range.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Henry Wilson <henry.wilson@acentic.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/inotify/inotify_user.c | 9 ++++++++-
 include/linux/inotify.h          | 2 +-
 include/uapi/linux/inotify.h     | 1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1cf5b779d862..6f48d325c350 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -510,6 +510,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	__u32 old_mask, new_mask;
 	__u32 mask;
 	int add = (arg & IN_MASK_ADD);
+	int create = (arg & IN_MASK_CREATE);
 	int ret;
 
 	mask = inotify_arg_to_mask(arg);
@@ -517,6 +518,8 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 	if (!fsn_mark)
 		return -ENOENT;
+	else if (create)
+		return -EEXIST;
 
 	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
@@ -718,6 +721,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	if (unlikely(!f.file))
 		return -EBADF;
 
+	/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
+	if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
+		return -EINVAL;
+
 	/* verify that this is indeed an inotify instance */
 	if (unlikely(f.file->f_op != &inotify_fops)) {
 		ret = -EINVAL;
@@ -806,7 +813,7 @@ static int __init inotify_user_setup(void)
 	BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
 	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
 
-	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
+	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22);
 
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 44f9ffe72c87..6a24905f6e1e 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -18,6 +18,6 @@ extern struct ctl_table inotify_table[]; /* for sysctl */
 			  IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT | \
 			  IN_Q_OVERFLOW | IN_IGNORED | IN_ONLYDIR | \
 			  IN_DONT_FOLLOW | IN_EXCL_UNLINK | IN_MASK_ADD | \
-			  IN_ISDIR | IN_ONESHOT)
+			  IN_MASK_CREATE | IN_ISDIR | IN_ONESHOT)
 
 #endif	/* _LINUX_INOTIFY_H */
diff --git a/include/uapi/linux/inotify.h b/include/uapi/linux/inotify.h
index 4800bf2a531d..884b4846b630 100644
--- a/include/uapi/linux/inotify.h
+++ b/include/uapi/linux/inotify.h
@@ -53,6 +53,7 @@ struct inotify_event {
 #define IN_ONLYDIR		0x01000000	/* only watch the path if it is a directory */
 #define IN_DONT_FOLLOW		0x02000000	/* don't follow a sym link */
 #define IN_EXCL_UNLINK		0x04000000	/* exclude events on unlinked objects */
+#define IN_MASK_CREATE		0x10000000	/* only create watches */
 #define IN_MASK_ADD		0x20000000	/* add to the mask of an already existing watch */
 #define IN_ISDIR		0x40000000	/* event occurred against dir */
 #define IN_ONESHOT		0x80000000	/* only send event once */
-- 
cgit v1.2.3


From e1a98c163eb276b5b5e1bece560ed7f0b9eb3b49 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hansverk@cisco.com>
Date: Fri, 15 Jun 2018 09:19:46 -0400
Subject: media: media.h: remove __NEED_MEDIA_LEGACY_API

The __NEED_MEDIA_LEGACY_API define is 1) ugly and 2) dangerous
since it is all too easy for drivers to define it to get hold of
legacy defines. Instead just define what we need in media-device.c
which is the only place where we need the legacy define
(MEDIA_ENT_T_DEVNODE_UNKNOWN).

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Acked-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/media-device.c | 13 ++++++++++---
 include/uapi/linux/media.h   |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index ae59c3177555..47bb2254fbfd 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -16,9 +16,6 @@
  * GNU General Public License for more details.
  */
 
-/* We need to access legacy defines from linux/media.h */
-#define __NEED_MEDIA_LEGACY_API
-
 #include <linux/compat.h>
 #include <linux/export.h>
 #include <linux/idr.h>
@@ -35,6 +32,16 @@
 
 #ifdef CONFIG_MEDIA_CONTROLLER
 
+/*
+ * Legacy defines from linux/media.h. This is the only place we need this
+ * so we just define it here. The media.h header doesn't expose it to the
+ * kernel to prevent it from being used by drivers, but here (and only here!)
+ * we need it to handle the legacy behavior.
+ */
+#define MEDIA_ENT_SUBTYPE_MASK			0x0000ffff
+#define MEDIA_ENT_T_DEVNODE_UNKNOWN		(MEDIA_ENT_F_OLD_BASE | \
+						 MEDIA_ENT_SUBTYPE_MASK)
+
 /* -----------------------------------------------------------------------------
  * Userspace API
  */
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index c7e9a5cba24e..86c7dcc9cba3 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -348,7 +348,7 @@ struct media_v2_topology {
 #define MEDIA_IOC_SETUP_LINK	_IOWR('|', 0x03, struct media_link_desc)
 #define MEDIA_IOC_G_TOPOLOGY	_IOWR('|', 0x04, struct media_v2_topology)
 
-#if !defined(__KERNEL__) || defined(__NEED_MEDIA_LEGACY_API)
+#ifndef __KERNEL__
 
 /*
  * Legacy symbols used to avoid userspace compilation breakages.
-- 
cgit v1.2.3


From 0a9fe5c375b57fab6d18ed0a6a7f935eefb09db3 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Wed, 27 Jun 2018 10:32:19 -0700
Subject: netem: slotting with non-uniform distribution

Extend slotting with support for non-uniform distributions. This is
similar to netem's non-uniform distribution delay feature.

Commit f043efeae2f1 ("netem: support delivering packets in delayed
time slots") added the slotting feature to approximate the behaviors
of media with packet aggregation but only supported a uniform
distribution for delays between transmission attempts. Tests with TCP
BBR with emulated wifi links with non-uniform distributions produced
more useful results.

Syntax:
   slot dist DISTRIBUTION DELAY JITTER [packets MAX_PACKETS] \
      [bytes MAX_BYTES]

The syntax and use of the distribution table is the same as in the
non-uniform distribution delay feature. A file DISTRIBUTION must be
present in TC_LIB_DIR (e.g. /usr/lib/tc) containing numbers scaled by
NETEM_DIST_SCALE. A random value x is selected from the table and it
takes DELAY + ( x * JITTER ) as delay. Correlation between values is not
supported.

Examples:
  Normal distribution delay with mean = 800us and stdev = 100us.
  > tc qdisc add dev eth0 root netem slot dist normal 800us 100us

  Optionally set the max slot size in bytes and/or packets.
  > tc qdisc add dev eth0 root netem slot dist normal 800us 100us \
    bytes 64k packets 42

Signed-off-by: Yousuk Seung <ysseung@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  3 ++
 net/sched/sch_netem.c          | 73 ++++++++++++++++++++++++++++--------------
 2 files changed, 52 insertions(+), 24 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096ae97b..bad3c03bcf43 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -539,6 +539,7 @@ enum {
 	TCA_NETEM_LATENCY64,
 	TCA_NETEM_JITTER64,
 	TCA_NETEM_SLOT,
+	TCA_NETEM_SLOT_DIST,
 	__TCA_NETEM_MAX,
 };
 
@@ -581,6 +582,8 @@ struct tc_netem_slot {
 	__s64   max_delay;
 	__s32   max_packets;
 	__s32   max_bytes;
+	__s64	dist_delay; /* nsec */
+	__s64	dist_jitter; /* nsec */
 };
 
 enum {
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7d6801fc5340..ad18a2052416 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -68,6 +68,11 @@
 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
 */
 
+struct disttable {
+	u32  size;
+	s16 table[0];
+};
+
 struct netem_sched_data {
 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
 	struct rb_root t_root;
@@ -99,10 +104,7 @@ struct netem_sched_data {
 		u32 rho;
 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 
-	struct disttable {
-		u32  size;
-		s16 table[0];
-	} *delay_dist;
+	struct disttable *delay_dist;
 
 	enum  {
 		CLG_RANDOM,
@@ -142,6 +144,7 @@ struct netem_sched_data {
 		s32 bytes_left;
 	} slot;
 
+	struct disttable *slot_dist;
 };
 
 /* Time stamp put into socket buffer control block
@@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state)
 	u64 value, rho;
 	unsigned long answer;
 
-	if (state->rho == 0)	/* no correlation */
+	if (!state || state->rho == 0)	/* no correlation */
 		return prandom_u32();
 
 	value = prandom_u32();
@@ -601,10 +604,19 @@ finish_segs:
 
 static void get_slot_next(struct netem_sched_data *q, u64 now)
 {
-	q->slot.slot_next = now + q->slot_config.min_delay +
-		(prandom_u32() *
-			(q->slot_config.max_delay -
-				q->slot_config.min_delay) >> 32);
+	s64 next_delay;
+
+	if (!q->slot_dist)
+		next_delay = q->slot_config.min_delay +
+				(prandom_u32() *
+				 (q->slot_config.max_delay -
+				  q->slot_config.min_delay) >> 32);
+	else
+		next_delay = tabledist(q->slot_config.dist_delay,
+				       (s32)(q->slot_config.dist_jitter),
+				       NULL, q->slot_dist);
+
+	q->slot.slot_next = now + next_delay;
 	q->slot.packets_left = q->slot_config.max_packets;
 	q->slot.bytes_left = q->slot_config.max_bytes;
 }
@@ -721,9 +733,9 @@ static void dist_free(struct disttable *d)
  * signed 16 bit values.
  */
 
-static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
+static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
+			  const struct nlattr *attr)
 {
-	struct netem_sched_data *q = qdisc_priv(sch);
 	size_t n = nla_len(attr)/sizeof(__s16);
 	const __s16 *data = nla_data(attr);
 	spinlock_t *root_lock;
@@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 	root_lock = qdisc_root_sleeping_lock(sch);
 
 	spin_lock_bh(root_lock);
-	swap(q->delay_dist, d);
+	swap(*tbl, d);
 	spin_unlock_bh(root_lock);
 
 	dist_free(d);
@@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 		q->slot_config.max_bytes = INT_MAX;
 	q->slot.packets_left = q->slot_config.max_packets;
 	q->slot.bytes_left = q->slot_config.max_bytes;
-	if (q->slot_config.min_delay | q->slot_config.max_delay)
+	if (q->slot_config.min_delay | q->slot_config.max_delay |
+	    q->slot_config.dist_jitter)
 		q->slot.slot_next = ktime_get_ns();
 	else
 		q->slot.slot_next = 0;
@@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	if (tb[TCA_NETEM_DELAY_DIST]) {
-		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
-		if (ret) {
-			/* recover clg and loss_model, in case of
-			 * q->clg and q->loss_model were modified
-			 * in get_loss_clg()
-			 */
-			q->clg = old_clg;
-			q->loss_model = old_loss_model;
-			return ret;
-		}
+		ret = get_dist_table(sch, &q->delay_dist,
+				     tb[TCA_NETEM_DELAY_DIST]);
+		if (ret)
+			goto get_table_failure;
+	}
+
+	if (tb[TCA_NETEM_SLOT_DIST]) {
+		ret = get_dist_table(sch, &q->slot_dist,
+				     tb[TCA_NETEM_SLOT_DIST]);
+		if (ret)
+			goto get_table_failure;
 	}
 
 	sch->limit = qopt->limit;
@@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 		get_slot(q, tb[TCA_NETEM_SLOT]);
 
 	return ret;
+
+get_table_failure:
+	/* recover clg and loss_model, in case of
+	 * q->clg and q->loss_model were modified
+	 * in get_loss_clg()
+	 */
+	q->clg = old_clg;
+	q->loss_model = old_loss_model;
+	return ret;
 }
 
 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch)
 	if (q->qdisc)
 		qdisc_destroy(q->qdisc);
 	dist_free(q->delay_dist);
+	dist_free(q->slot_dist);
 }
 
 static int dump_loss_model(const struct netem_sched_data *q,
@@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (dump_loss_model(q, skb) != 0)
 		goto nla_put_failure;
 
-	if (q->slot_config.min_delay | q->slot_config.max_delay) {
+	if (q->slot_config.min_delay | q->slot_config.max_delay |
+	    q->slot_config.dist_jitter) {
 		slot = q->slot_config;
 		if (slot.max_packets == INT_MAX)
 			slot.max_packets = 0;
-- 
cgit v1.2.3


From d020d4559de9baf47cafa2669f29ea59d11a914c Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Wed, 27 Jun 2018 13:33:31 -0400
Subject: net sched actions: fix coding style in pedit headers

Fix coding style issues in tc pedit headers detected by the
checkpatch script.

Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_pedit.h        | 1 +
 include/uapi/linux/tc_act/tc_pedit.h | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 227a6f1d02f4..fac3ad4a86de 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -17,6 +17,7 @@ struct tcf_pedit {
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
+
 #define to_pedit(a) ((struct tcf_pedit *)a)
 
 static inline bool is_tcf_pedit(const struct tc_action *a)
diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index 162d1094c41c..24ec792dacc1 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -17,13 +17,15 @@ enum {
 	TCA_PEDIT_KEY_EX,
 	__TCA_PEDIT_MAX
 };
+
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)
-                                                                                
+
 enum {
 	TCA_PEDIT_KEY_EX_HTYPE = 1,
 	TCA_PEDIT_KEY_EX_CMD = 2,
 	__TCA_PEDIT_KEY_EX_MAX
 };
+
 #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
 
  /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWROK is a special case for legacy users. It
@@ -38,6 +40,7 @@ enum pedit_header_type {
 	TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
 	__PEDIT_HDR_TYPE_MAX,
 };
+
 #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
 
 enum pedit_cmd {
@@ -45,6 +48,7 @@ enum pedit_cmd {
 	TCA_PEDIT_KEY_EX_CMD_ADD = 1,
 	__PEDIT_CMD_MAX,
 };
+
 #define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1)
 
 struct tc_pedit_key {
@@ -55,13 +59,14 @@ struct tc_pedit_key {
 	__u32           offmask;
 	__u32           shift;
 };
-                                                                                
+
 struct tc_pedit_sel {
 	tc_gen;
 	unsigned char           nkeys;
 	unsigned char           flags;
 	struct tc_pedit_key     keys[0];
 };
+
 #define tc_pedit tc_pedit_sel
 
 #endif
-- 
cgit v1.2.3


From 4c79579b44b1876444f4d04de31c1a37098a0350 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 26 Jun 2018 16:21:18 -0700
Subject: bpf: Change bpf_fib_lookup to return lookup status

For ACLs implemented using either FIB rules or FIB entries, the BPF
program needs the FIB lookup status to be able to drop the packet.
Since the bpf_fib_lookup API has not reached a released kernel yet,
change the return code to contain an encoding of the FIB lookup
result and return the nexthop device index in the params struct.

In addition, inform the BPF program of any post FIB lookup reason as
to why the packet needs to go up the stack.

The fib result for unicast routes must have an egress device, so remove
the check that it is non-NULL.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h   | 28 ++++++++++++---
 net/core/filter.c          | 86 +++++++++++++++++++++++++++++-----------------
 samples/bpf/xdp_fwd_kern.c |  8 ++---
 3 files changed, 81 insertions(+), 41 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6a40d7..b7db3261c62d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1857,7 +1857,8 @@ union bpf_attr {
  *		is resolved), the nexthop address is returned in ipv4_dst
  *		or ipv6_dst based on family, smac is set to mac address of
  *		egress device, dmac is set to nexthop mac address, rt_metric
- *		is set to metric from route (IPv4/IPv6 only).
+ *		is set to metric from route (IPv4/IPv6 only), and ifindex
+ *		is set to the device index of the nexthop from the FIB lookup.
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -1873,9 +1874,10 @@ union bpf_attr {
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
  *     Return
- *             Egress device index on success, 0 if packet needs to continue
- *             up the stack for further processing or a negative error in case
- *             of failure.
+ *		* < 0 if any input argument is invalid
+ *		*   0 on success (packet is forwarded, nexthop neighbor exists)
+ *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ *		*     packet is not forwarded or needs assist from full stack
  *
  * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
@@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
+enum {
+	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
+	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed; can be dropped */
+	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable; can be dropped */
+	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed; can be dropped */
+	BPF_FIB_LKUP_RET_NOT_FWDED,    /* packet is not forwarded */
+	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
+	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neighbor entry for nh */
+	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+};
+
 struct bpf_fib_lookup {
 	/* input:  network family for lookup (AF_INET, AF_INET6)
 	 * output: network family of egress nexthop
@@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
 
 	/* total length of packet from network header - used for MTU check */
 	__u16	tot_len;
-	__u32	ifindex;  /* L3 device index for lookup */
+
+	/* input: L3 device index for lookup
+	 * output: device index from FIB lookup
+	 */
+	__u32	ifindex;
 
 	union {
 		/* inputs to lookup */
diff --git a/net/core/filter.c b/net/core/filter.c
index e7f12e9f598c..0ca6907d7efe 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 	params->h_vlan_TCI = 0;
 	params->h_vlan_proto = 0;
+	params->ifindex = dev->ifindex;
 
-	return dev->ifindex;
+	return 0;
 }
 #endif
 
@@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	/* verify forwarding is enabled on this interface */
 	in_dev = __in_dev_get_rcu(dev);
 	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
-		return 0;
+		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
 		fl4.flowi4_iif = 1;
@@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		tb = fib_get_table(net, tbid);
 		if (unlikely(!tb))
-			return 0;
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 	} else {
@@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
 	}
 
-	if (err || res.type != RTN_UNICAST)
-		return 0;
+	if (err) {
+		/* map fib lookup errors to RTN_ type */
+		if (err == -EINVAL)
+			return BPF_FIB_LKUP_RET_BLACKHOLE;
+		if (err == -EHOSTUNREACH)
+			return BPF_FIB_LKUP_RET_UNREACHABLE;
+		if (err == -EACCES)
+			return BPF_FIB_LKUP_RET_PROHIBIT;
+
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
+	}
+
+	if (res.type != RTN_UNICAST)
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	if (res.fi->fib_nhs > 1)
 		fib_select_path(net, &res, &fl4, NULL);
@@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
 		if (params->tot_len > mtu)
-			return 0;
+			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
 	nh = &res.fi->fib_nh[res.nh_sel];
 
 	/* do not handle lwt encaps right now */
 	if (nh->nh_lwtstate)
-		return 0;
+		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	dev = nh->nh_dev;
-	if (unlikely(!dev))
-		return 0;
-
 	if (nh->nh_gw)
 		params->ipv4_dst = nh->nh_gw;
 
@@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 * rcu_read_lock_bh is not needed here
 	 */
 	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
-	if (neigh)
-		return bpf_fib_set_fwd_params(params, neigh, dev);
+	if (!neigh)
+		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return 0;
+	return bpf_fib_set_fwd_params(params, neigh, dev);
 }
 #endif
 
@@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 	/* link local addresses are never forwarded */
 	if (rt6_need_strict(dst) || rt6_need_strict(src))
-		return 0;
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	dev = dev_get_by_index_rcu(net, params->ifindex);
 	if (unlikely(!dev))
@@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 	idev = __in6_dev_get_safely(dev);
 	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
-		return 0;
+		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
 		fl6.flowi6_iif = 1;
@@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		tb = ipv6_stub->fib6_get_table(net, tbid);
 		if (unlikely(!tb))
-			return 0;
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
 	} else {
@@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	}
 
 	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
-		return 0;
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
+		switch (f6i->fib6_type) {
+		case RTN_BLACKHOLE:
+			return BPF_FIB_LKUP_RET_BLACKHOLE;
+		case RTN_UNREACHABLE:
+			return BPF_FIB_LKUP_RET_UNREACHABLE;
+		case RTN_PROHIBIT:
+			return BPF_FIB_LKUP_RET_PROHIBIT;
+		default:
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
+		}
+	}
 
-	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
-	    f6i->fib6_type != RTN_UNICAST))
-		return 0;
+	if (f6i->fib6_type != RTN_UNICAST)
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
 		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
@@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
 		if (params->tot_len > mtu)
-			return 0;
+			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
 	if (f6i->fib6_nh.nh_lwtstate)
-		return 0;
+		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	if (f6i->fib6_flags & RTF_GATEWAY)
 		*dst = f6i->fib6_nh.nh_gw;
@@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 */
 	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
 				      ndisc_hashfn, dst, dev);
-	if (neigh)
-		return bpf_fib_set_fwd_params(params, neigh, dev);
+	if (!neigh)
+		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return 0;
+	return bpf_fib_set_fwd_params(params, neigh, dev);
 }
 #endif
 
@@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	struct net *net = dev_net(skb->dev);
-	int index = -EAFNOSUPPORT;
+	int rc = -EAFNOSUPPORT;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
@@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-		index = bpf_ipv4_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv4_fib_lookup(net, params, flags, false);
 		break;
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		index = bpf_ipv6_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv6_fib_lookup(net, params, flags, false);
 		break;
 #endif
 	}
 
-	if (index > 0) {
+	if (!rc) {
 		struct net_device *dev;
 
-		dev = dev_get_by_index_rcu(net, index);
+		dev = dev_get_by_index_rcu(net, params->ifindex);
 		if (!is_skb_forwardable(dev, skb))
-			index = 0;
+			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
-	return index;
+	return rc;
 }
 
 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 6673cdb9f55c..a7e94e7ff87d 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	struct ethhdr *eth = data;
 	struct ipv6hdr *ip6h;
 	struct iphdr *iph;
-	int out_index;
 	u16 h_proto;
 	u64 nh_off;
+	int rc;
 
 	nh_off = sizeof(*eth);
 	if (data + nh_off > data_end)
@@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 	fib_params.ifindex = ctx->ingress_ifindex;
 
-	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
 
 	/* verify egress index has xdp support
 	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
@@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	 * NOTE: without verification that egress index supports XDP
 	 *       forwarding packets are dropped.
 	 */
-	if (out_index > 0) {
+	if (rc == 0) {
 		if (h_proto == htons(ETH_P_IP))
 			ip_decrease_ttl(iph);
 		else if (h_proto == htons(ETH_P_IPV6))
@@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
 		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
-		return bpf_redirect_map(&tx_port, out_index, 0);
+		return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
 	}
 
 	return XDP_PASS;
-- 
cgit v1.2.3


From b6e71bdebb12cb79f931db358066a33f5f526b6a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 27 Jun 2018 14:39:02 -0700
Subject: ila: Flush netlink command to clear xlat table

Add ILA_CMD_FLUSH netlink command to clear the ILA translation table.

Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ila.h |  1 +
 net/ipv6/ila/ila.h       |  1 +
 net/ipv6/ila/ila_main.c  |  6 +++++
 net/ipv6/ila/ila_xlat.c  | 62 ++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 483b77af4eb8..db45d3e49a12 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -30,6 +30,7 @@ enum {
 	ILA_CMD_ADD,
 	ILA_CMD_DEL,
 	ILA_CMD_GET,
+	ILA_CMD_FLUSH,
 
 	__ILA_CMD_MAX,
 };
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index faba7824ea56..1f747bcbec29 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -123,6 +123,7 @@ void ila_xlat_exit_net(struct net *net);
 int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_dump_start(struct netlink_callback *cb);
 int ila_xlat_nl_dump_done(struct netlink_callback *cb);
 int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
index f6ac6b14577e..18fac76b9520 100644
--- a/net/ipv6/ila/ila_main.c
+++ b/net/ipv6/ila/ila_main.c
@@ -26,6 +26,12 @@ static const struct genl_ops ila_nl_ops[] = {
 		.policy = ila_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = ILA_CMD_FLUSH,
+		.doit = ila_xlat_nl_cmd_flush,
+		.policy = ila_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
 	{
 		.cmd = ILA_CMD_GET,
 		.doit = ila_xlat_nl_cmd_get_mapping,
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index d05de891dfb6..51a15ce50a64 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -164,9 +164,9 @@ static inline void ila_release(struct ila_map *ila)
 	kfree_rcu(ila, rcu);
 }
 
-static void ila_free_cb(void *ptr, void *arg)
+static void ila_free_node(struct ila_map *ila)
 {
-	struct ila_map *ila = (struct ila_map *)ptr, *next;
+	struct ila_map *next;
 
 	/* Assume rcu_readlock held */
 	while (ila) {
@@ -176,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg)
 	}
 }
 
+static void ila_free_cb(void *ptr, void *arg)
+{
+	ila_free_node((struct ila_map *)ptr);
+}
+
 static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
 
 static unsigned int
@@ -365,6 +370,59 @@ int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
+					    struct ila_map *ila)
+{
+	return ila_get_lock(ilan, ila->xp.ip.locator_match);
+}
+
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct ila_net *ilan = net_generic(net, ila_net_id);
+	struct rhashtable_iter iter;
+	struct ila_map *ila;
+	spinlock_t *lock;
+	int ret;
+
+	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
+	if (ret)
+		goto done;
+
+	rhashtable_walk_start(&iter);
+
+	for (;;) {
+		ila = rhashtable_walk_next(&iter);
+
+		if (IS_ERR(ila)) {
+			if (PTR_ERR(ila) == -EAGAIN)
+				continue;
+			ret = PTR_ERR(ila);
+			goto done;
+		} else if (!ila) {
+			break;
+		}
+
+		lock = lock_from_ila_map(ilan, ila);
+
+		spin_lock(lock);
+
+		ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
+					     &ila->node, rht_params);
+		if (!ret)
+			ila_free_node(ila);
+
+		spin_unlock(lock);
+
+		if (ret)
+			break;
+	}
+
+done:
+	rhashtable_walk_stop(&iter);
+	return ret;
+}
+
 static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
 {
 	if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
-- 
cgit v1.2.3


From b0e9a2fe3ff971950833bc0ffc383babd9443bc4 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 28 Jun 2018 15:31:00 +0800
Subject: sctp: add support for SCTP_REUSE_PORT sockopt

This feature is actually already supported by sk->sk_reuse which can be
set by socket level opt SO_REUSEADDR. But it's not working exactly as
RFC6458 demands in section 8.1.27, like:

  - This option only supports one-to-one style SCTP sockets
  - This socket option must not be used after calling bind()
    or sctp_bindx().

Besides, the SCTP_REUSE_PORT sockopt should be provided for user programs.
Otherwise, programs that use SCTP_REUSE_PORT on other systems will not
work on Linux.

To separate it from the socket level version, this patch adds 'reuse' in
sctp_sock and it works pretty much as sk->sk_reuse, but with some extra
setup limitations that are needed when it is being enabled.

"It should be noted that the behavior of the socket-level socket option
to reuse ports and/or addresses for SCTP sockets is unspecified", so it
leaves SO_REUSEADDR as is for the compatibility.

Note that the name SCTP_REUSE_PORT is somewhat confusing, as its
functionality is nearly identical to SO_REUSEADDR, but with some
extra restrictions. Here it uses 'reuse' in sctp_sock instead of
'reuseport'. As for sk->sk_reuseport support for SCTP, it will be
added in another patch.

Thanks to Neil for making this clear.

v1->v2:
  - add sctp_sk->reuse to separate it from the socket level version.
v2->v3:
  - improve changelog according to Marcelo's suggestion.

Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 +
 include/uapi/linux/sctp.h  |  1 +
 net/sctp/socket.c          | 62 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 57 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e0f962d27386..701a51736fa5 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -220,6 +220,7 @@ struct sctp_sock {
 	__u32 adaptation_ind;
 	__u32 pd_point;
 	__u16	nodelay:1,
+		reuse:1,
 		disable_fragments:1,
 		v4mapped:1,
 		frag_interleave:1,
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index b64d583bf053..c02986a284db 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -100,6 +100,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_RECVNXTINFO	33
 #define SCTP_DEFAULT_SNDINFO	34
 #define SCTP_AUTH_DEACTIVATE_KEY	35
+#define SCTP_REUSE_PORT		36
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0e91e83eea5a..bf11f9cacb63 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4170,6 +4170,28 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
+				      unsigned int optlen)
+{
+	int val;
+
+	if (!sctp_style(sk, TCP))
+		return -EOPNOTSUPP;
+
+	if (sctp_sk(sk)->ep->base.bind_addr.port)
+		return -EFAULT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	sctp_sk(sk)->reuse = !!val;
+
+	return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4364,6 +4386,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_setsockopt_interleaving_supported(sk, optval,
 								optlen);
 		break;
+	case SCTP_REUSE_PORT:
+		retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -7197,6 +7222,26 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
+				      char __user *optval,
+				      int __user *optlen)
+{
+	int val;
+
+	if (len < sizeof(int))
+		return -EINVAL;
+
+	len = sizeof(int);
+	val = sctp_sk(sk)->reuse;
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -7392,6 +7437,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
 								optlen);
 		break;
+	case SCTP_REUSE_PORT:
+		retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -7429,6 +7477,7 @@ static struct sctp_bind_bucket *sctp_bucket_create(
 
 static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 {
+	bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
 	struct sctp_bind_hashbucket *head; /* hash list */
 	struct sctp_bind_bucket *pp;
 	unsigned short snum;
@@ -7501,13 +7550,11 @@ pp_found:
 		 * used by other socket (pp->owner not empty); that other
 		 * socket is going to be sk2.
 		 */
-		int reuse = sk->sk_reuse;
 		struct sock *sk2;
 
 		pr_debug("%s: found a possible match\n", __func__);
 
-		if (pp->fastreuse && sk->sk_reuse &&
-			sk->sk_state != SCTP_SS_LISTENING)
+		if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
 			goto success;
 
 		/* Run through the list of sockets bound to the port
@@ -7525,7 +7572,7 @@ pp_found:
 			ep2 = sctp_sk(sk2)->ep;
 
 			if (sk == sk2 ||
-			    (reuse && sk2->sk_reuse &&
+			    (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
 			     sk2->sk_state != SCTP_SS_LISTENING))
 				continue;
 
@@ -7549,12 +7596,12 @@ pp_not_found:
 	 * SO_REUSEADDR on this socket -sk-).
 	 */
 	if (hlist_empty(&pp->owner)) {
-		if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING)
+		if (reuse && sk->sk_state != SCTP_SS_LISTENING)
 			pp->fastreuse = 1;
 		else
 			pp->fastreuse = 0;
 	} else if (pp->fastreuse &&
-		(!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING))
+		   (!reuse || sk->sk_state == SCTP_SS_LISTENING))
 		pp->fastreuse = 0;
 
 	/* We are set, so fill up all the data in the hash table
@@ -7685,7 +7732,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
 		err = 0;
 		sctp_unhash_endpoint(ep);
 		sk->sk_state = SCTP_SS_CLOSED;
-		if (sk->sk_reuse)
+		if (sk->sk_reuse || sctp_sk(sk)->reuse)
 			sctp_sk(sk)->bind_hash->fastreuse = 1;
 		goto out;
 	}
@@ -8550,6 +8597,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newsk->sk_no_check_tx = sk->sk_no_check_tx;
 	newsk->sk_no_check_rx = sk->sk_no_check_rx;
 	newsk->sk_reuse = sk->sk_reuse;
+	sctp_sk(newsk)->reuse = sp->reuse;
 
 	newsk->sk_shutdown = sk->sk_shutdown;
 	newsk->sk_destruct = sctp_destruct_sock;
-- 
cgit v1.2.3


From 0ed5269f9e41f495c8e9020c85f5e1644c1afc57 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Tue, 26 Jun 2018 21:39:37 -0700
Subject: net/sched: add tunnel option support to act_tunnel_key

Allow setting tunnel options using the act_tunnel_key action.

Options are expressed as class:type:data and multiple options
may be listed using a comma delimiter.

 # ip link add name geneve0 type geneve dstport 0 external
 # tc qdisc add dev eth0 ingress
 # tc filter add dev eth0 protocol ip parent ffff: \
     flower indev eth0 \
        ip_proto udp \
        action tunnel_key \
            set src_ip 10.0.99.192 \
            dst_ip 10.0.99.193 \
            dst_port 6081 \
            id 11 \
            geneve_opts 0102:80:00800022,0102:80:00800022 \
    action mirred egress redirect dev geneve0

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h |  26 ++++
 net/sched/act_tunnel_key.c                | 214 +++++++++++++++++++++++++++++-
 2 files changed, 236 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index 72bbefe5d1d1..e284fec8c467 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -36,9 +36,35 @@ enum {
 	TCA_TUNNEL_KEY_PAD,
 	TCA_TUNNEL_KEY_ENC_DST_PORT,	/* be16 */
 	TCA_TUNNEL_KEY_NO_CSUM,		/* u8 */
+	TCA_TUNNEL_KEY_ENC_OPTS,	/* Nested TCA_TUNNEL_KEY_ENC_OPTS_
+					 * attributes
+					 */
 	__TCA_TUNNEL_KEY_MAX,
 };
 
 #define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
 
+enum {
+	TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC,
+	TCA_TUNNEL_KEY_ENC_OPTS_GENEVE,		/* Nested
+						 * TCA_TUNNEL_KEY_ENC_OPT_GENEVE_
+						 * attributes
+						 */
+	__TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+};
+
+#define TCA_TUNNEL_KEY_ENC_OPTS_MAX (__TCA_TUNNEL_KEY_ENC_OPTS_MAX - 1)
+
+enum {
+	TCA_TUNNEL_KEY_ENC_OPT_GENEVE_UNSPEC,
+	TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,		/* be16 */
+	TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,		/* u8 */
+	TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,		/* 4 to 128 bytes */
+
+	__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+};
+
+#define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \
+	(__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1)
+
 #endif
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 20e98ed8d498..ea203e386a92 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
+#include <net/geneve.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -57,6 +58,135 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
 	return action;
 }
 
+static const struct nla_policy
+enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+	[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+	[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]	   = { .type = NLA_U16 },
+	[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]	   = { .type = NLA_U8 },
+	[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]	   = { .type = NLA_BINARY,
+						       .len = 128 },
+};
+
+static int
+tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1];
+	int err, data_len, opt_len;
+	u8 *data;
+
+	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+			       nla, geneve_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] ||
+	    !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] ||
+	    !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+		return -EINVAL;
+	}
+
+	data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+	data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+	if (data_len < 4) {
+		NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+		return -ERANGE;
+	}
+	if (data_len % 4) {
+		NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+		return -ERANGE;
+	}
+
+	opt_len = sizeof(struct geneve_opt) + data_len;
+	if (dst) {
+		struct geneve_opt *opt = dst;
+
+		WARN_ON(dst_len < opt_len);
+
+		opt->opt_class =
+			nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]);
+		opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]);
+		opt->length = data_len / 4; /* length is in units of 4 bytes */
+		opt->r1 = 0;
+		opt->r2 = 0;
+		opt->r3 = 0;
+
+		memcpy(opt + 1, data, data_len);
+	}
+
+	return opt_len;
+}
+
+static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
+				int dst_len, struct netlink_ext_ack *extack)
+{
+	int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+	const struct nlattr *attr, *head = nla_data(nla);
+
+	err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+			   enc_opts_policy, extack);
+	if (err)
+		return err;
+
+	nla_for_each_attr(attr, head, len, rem) {
+		switch (nla_type(attr)) {
+		case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+			opt_len = tunnel_key_copy_geneve_opt(attr, dst,
+							     dst_len, extack);
+			if (opt_len < 0)
+				return opt_len;
+			opts_len += opt_len;
+			if (dst) {
+				dst_len -= opt_len;
+				dst += opt_len;
+			}
+			break;
+		}
+	}
+
+	if (!opts_len) {
+		NL_SET_ERR_MSG(extack, "Empty list of tunnel options");
+		return -EINVAL;
+	}
+
+	if (rem > 0) {
+		NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes");
+		return -EINVAL;
+	}
+
+	return opts_len;
+}
+
+static int tunnel_key_get_opts_len(struct nlattr *nla,
+				   struct netlink_ext_ack *extack)
+{
+	return tunnel_key_copy_opts(nla, NULL, 0, extack);
+}
+
+static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
+			       int opts_len, struct netlink_ext_ack *extack)
+{
+	info->options_len = opts_len;
+	switch (nla_type(nla_data(nla))) {
+	case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+#if IS_ENABLED(CONFIG_INET)
+		info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+					    opts_len, extack);
+#else
+		return -EAFNOSUPPORT;
+#endif
+	default:
+		NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
+		return -EINVAL;
+	}
+}
+
 static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
 	[TCA_TUNNEL_KEY_PARMS]	    = { .len = sizeof(struct tc_tunnel_key) },
 	[TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
@@ -66,6 +196,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
 	[TCA_TUNNEL_KEY_ENC_KEY_ID]   = { .type = NLA_U32 },
 	[TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
 	[TCA_TUNNEL_KEY_NO_CSUM]      = { .type = NLA_U8 },
+	[TCA_TUNNEL_KEY_ENC_OPTS]     = { .type = NLA_NESTED },
 };
 
 static int tunnel_key_init(struct net *net, struct nlattr *nla,
@@ -81,6 +212,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	struct tcf_tunnel_key *t;
 	bool exists = false;
 	__be16 dst_port = 0;
+	int opts_len = 0;
 	__be64 key_id;
 	__be16 flags;
 	int ret = 0;
@@ -128,6 +260,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 		if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
 			dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
 
+		if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) {
+			opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+							   extack);
+			if (opts_len < 0) {
+				ret = opts_len;
+				goto err_out;
+			}
+		}
+
 		if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
 		    tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
 			__be32 saddr;
@@ -138,7 +279,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 
 			metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
 						    dst_port, flags,
-						    key_id, 0);
+						    key_id, opts_len);
 		} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
 			   tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
 			struct in6_addr saddr;
@@ -162,6 +303,14 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			goto err_out;
 		}
 
+		if (opts_len) {
+			ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+						  &metadata->u.tun_info,
+						  opts_len, extack);
+			if (ret < 0)
+				goto err_out;
+		}
+
 		metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
 		break;
 	default:
@@ -234,6 +383,61 @@ static void tunnel_key_release(struct tc_action *a)
 	}
 }
 
+static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
+				       const struct ip_tunnel_info *info)
+{
+	int len = info->options_len;
+	u8 *src = (u8 *)(info + 1);
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+	if (!start)
+		return -EMSGSIZE;
+
+	while (len > 0) {
+		struct geneve_opt *opt = (struct geneve_opt *)src;
+
+		if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
+				 opt->opt_class) ||
+		    nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,
+			       opt->type) ||
+		    nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,
+			    opt->length * 4, opt + 1))
+			return -EMSGSIZE;
+
+		len -= sizeof(struct geneve_opt) + opt->length * 4;
+		src += sizeof(struct geneve_opt) + opt->length * 4;
+	}
+
+	nla_nest_end(skb, start);
+	return 0;
+}
+
+static int tunnel_key_opts_dump(struct sk_buff *skb,
+				const struct ip_tunnel_info *info)
+{
+	struct nlattr *start;
+	int err;
+
+	if (!info->options_len)
+		return 0;
+
+	start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+		err = tunnel_key_geneve_opts_dump(skb, info);
+		if (err)
+			return err;
+	} else {
+		return -EINVAL;
+	}
+
+	nla_nest_end(skb, start);
+	return 0;
+}
+
 static int tunnel_key_dump_addresses(struct sk_buff *skb,
 				     const struct ip_tunnel_info *info)
 {
@@ -284,8 +488,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 		goto nla_put_failure;
 
 	if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
-		struct ip_tunnel_key *key =
-			&params->tcft_enc_metadata->u.tun_info.key;
+		struct ip_tunnel_info *info =
+			&params->tcft_enc_metadata->u.tun_info;
+		struct ip_tunnel_key *key = &info->key;
 		__be32 key_id = tunnel_id_to_key32(key->tun_id);
 
 		if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
@@ -293,7 +498,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 					      &params->tcft_enc_metadata->u.tun_info) ||
 		    nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
 		    nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
-			       !(key->tun_flags & TUNNEL_CSUM)))
+			       !(key->tun_flags & TUNNEL_CSUM)) ||
+		    tunnel_key_opts_dump(skb, info))
 			goto nla_put_failure;
 	}
 
-- 
cgit v1.2.3


From b1277a226d8c519b8c33e23fe68b4e1658f15963 Mon Sep 17 00:00:00 2001
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Date: Fri, 29 Jun 2018 19:55:03 -0500
Subject: PCI: Cleanup PCI_REBAR_CTRL_BAR_SHIFT handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cleanup PCI_REBAR_CTRL_BAR_SHIFT handling.  That was hard coded instead of
properly defined in the header for some reason.

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c             | 6 +++---
 include/uapi/linux/pci_regs.h | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 44ccfb31363e..1b20c4392f09 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1193,7 +1193,7 @@ static void pci_restore_rebar_state(struct pci_dev *pdev)
 		res = pdev->resource + bar_idx;
 		size = order_base_2((resource_size(res) >> 20) | 1) - 1;
 		ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
-		ctrl |= size << 8;
+		ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT;
 		pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
 	}
 }
@@ -3098,7 +3098,7 @@ int pci_rebar_get_current_size(struct pci_dev *pdev, int bar)
 		return pos;
 
 	pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
-	return (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> 8;
+	return (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;
 }
 
 /**
@@ -3121,7 +3121,7 @@ int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
 
 	pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
 	ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
-	ctrl |= size << 8;
+	ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT;
 	pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
 	return 0;
 }
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 4da87e2ef8a8..82e6b361204e 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -960,8 +960,9 @@
 #define PCI_REBAR_CTRL		8	/* control register */
 #define  PCI_REBAR_CTRL_BAR_IDX		0x00000007  /* BAR index */
 #define  PCI_REBAR_CTRL_NBAR_MASK	0x000000E0  /* # of resizable BARs */
-#define  PCI_REBAR_CTRL_NBAR_SHIFT	5  	    /* shift for # of BARs */
+#define  PCI_REBAR_CTRL_NBAR_SHIFT	5	    /* shift for # of BARs */
 #define  PCI_REBAR_CTRL_BAR_SIZE	0x00001F00  /* BAR size */
+#define  PCI_REBAR_CTRL_BAR_SHIFT	8	    /* shift for BAR size */
 
 /* Dynamic Power Allocation */
 #define PCI_DPA_CAP		4	/* capability register */
-- 
cgit v1.2.3


From ea5d0c32498e1a08ff5f3dbeafa4d74895851b0d Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Thu, 28 Jun 2018 00:22:56 -0400
Subject: tcp: add new SNMP counter for drops when try to queue in rcv queue

When sk_rmem_alloc is larger than the receive buffer and we can't
schedule more memory for it, the skb will be dropped.

In the above situation, if this skb was to be put into the ofo queue,
LINUX_MIB_TCPOFODROP is incremented to track it.

However, if this skb was to be put into the receive queue, there is no
record. So a new SNMP counter is introduced to track this behavior.

LINUX_MIB_TCPRCVQDROP:  Number of packets meant to be queued in rcv queue
			but dropped because socket rcvbuf limit hit.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 +
 net/ipv4/proc.c           | 1 +
 net/ipv4/tcp_input.c      | 8 ++++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 97517f36a5f9..e5ebc83827ab 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -280,6 +280,7 @@ enum
 	LINUX_MIB_TCPDELIVEREDCE,		/* TCPDeliveredCE */
 	LINUX_MIB_TCPACKCOMPRESSED,		/* TCPAckCompressed */
 	LINUX_MIB_TCPZEROWINDOWDROP,		/* TCPZeroWindowDrop */
+	LINUX_MIB_TCPRCVQDROP,			/* TCPRcvQDrop */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 225ef3433fe5..b46e4cf9a55a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -288,6 +288,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
 	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
 	SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
+	SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c5b3415413f..eecd359595fc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4611,8 +4611,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	skb->data_len = data_len;
 	skb->len = size;
 
-	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
 		goto err_free;
+	}
 
 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
 	if (err)
@@ -4677,8 +4679,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 queue_and_out:
 		if (skb_queue_len(&sk->sk_receive_queue) == 0)
 			sk_forced_mem_schedule(sk, skb->truesize);
-		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
 			goto drop;
+		}
 
 		eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
 		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
-- 
cgit v1.2.3


From 4b1b7d3b30a6d32ac1a1dcede284e76ef8a8542d Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Thu, 28 Jun 2018 19:05:12 +0200
Subject: net/smc: add SMC-D diag support

This patch adds diag support for SMC-D.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Suggested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/smc_diag.h | 10 ++++++++++
 net/smc/smc_diag.c            | 15 +++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h
index 0ae5d4685ba3..92be255e534c 100644
--- a/include/uapi/linux/smc_diag.h
+++ b/include/uapi/linux/smc_diag.h
@@ -35,6 +35,7 @@ enum {
 	SMC_DIAG_CONNINFO,
 	SMC_DIAG_LGRINFO,
 	SMC_DIAG_SHUTDOWN,
+	SMC_DIAG_DMBINFO,
 	__SMC_DIAG_MAX,
 };
 
@@ -83,4 +84,13 @@ struct smc_diag_lgrinfo {
 	struct smc_diag_linkinfo	lnk[1];
 	__u8				role;
 };
+
+struct smcd_diag_dmbinfo {		/* SMC-D Socket internals */
+	__u32 linkid;			/* Link identifier */
+	__u64 peer_gid;			/* Peer GID */
+	__u64 my_gid;			/* My GID */
+	__u64 token;			/* Token of DMB */
+	__u64 peer_token;		/* Token of remote DMBE */
+};
+
 #endif /* _UAPI_SMC_DIAG_H_ */
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 64ce107c24d9..6d83eef1b743 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -156,6 +156,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 		if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
 			goto errout;
 	}
+	if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
+	    (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+	    !list_empty(&smc->conn.lgr->list)) {
+		struct smc_connection *conn = &smc->conn;
+		struct smcd_diag_dmbinfo dinfo = {
+			.linkid = *((u32 *)conn->lgr->id),
+			.peer_gid = conn->lgr->peer_gid,
+			.my_gid = conn->lgr->smcd->local_gid,
+			.token = conn->rmb_desc->token,
+			.peer_token = conn->peer_token
+		};
+
+		if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
+			goto errout;
+	}
 
 	nlmsg_end(skb, nlh);
 	return 0;
-- 
cgit v1.2.3


From a1be5a20f137bdf436bab86c18998229908ce951 Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Fri, 29 Jun 2018 13:26:18 +0200
Subject: tipc: extend sock diag for group communication

This commit extends the existing TIPC socket diagnostics framework
for information related to TIPC group communication.

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h | 14 ++++++++++++++
 net/tipc/group.c                  | 32 ++++++++++++++++++++++++++++++++
 net/tipc/group.h                  |  1 +
 net/tipc/socket.c                 |  5 +++++
 4 files changed, 52 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index 85c11982c89b..0ebe02ef1a86 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -121,6 +121,7 @@ enum {
 	TIPC_NLA_SOCK_TIPC_STATE,	/* u32 */
 	TIPC_NLA_SOCK_COOKIE,		/* u64 */
 	TIPC_NLA_SOCK_PAD,		/* flag */
+	TIPC_NLA_SOCK_GROUP,		/* nest */
 
 	__TIPC_NLA_SOCK_MAX,
 	TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1
@@ -233,6 +234,19 @@ enum {
 	TIPC_NLA_MON_PEER_MAX = __TIPC_NLA_MON_PEER_MAX - 1
 };
 
+/* Nest, socket group info */
+enum {
+	TIPC_NLA_SOCK_GROUP_ID,			/* u32 */
+	TIPC_NLA_SOCK_GROUP_OPEN,		/* flag */
+	TIPC_NLA_SOCK_GROUP_NODE_SCOPE,		/* flag */
+	TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE,	/* flag */
+	TIPC_NLA_SOCK_GROUP_INSTANCE,		/* u32 */
+	TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT,	/* u32 */
+
+	__TIPC_NLA_SOCK_GROUP_MAX,
+	TIPC_NLA_SOCK_GROUP_MAX = __TIPC_NLA_SOCK_GROUP_MAX - 1
+};
+
 /* Nest, connection info */
 enum {
 	TIPC_NLA_CON_UNSPEC,
diff --git a/net/tipc/group.c b/net/tipc/group.c
index d7a7befeddd4..cbe39e8db39c 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -918,3 +918,35 @@ void tipc_group_member_evt(struct tipc_group *grp,
 	}
 	*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
 }
+
+int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb)
+{
+	struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP);
+
+	if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID,
+			grp->type) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE,
+			grp->instance) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT,
+			grp->bc_snd_nxt))
+		goto group_msg_cancel;
+
+	if (grp->scope == TIPC_NODE_SCOPE)
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE))
+			goto group_msg_cancel;
+
+	if (grp->scope == TIPC_CLUSTER_SCOPE)
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE))
+			goto group_msg_cancel;
+
+	if (*grp->open)
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN))
+			goto group_msg_cancel;
+
+	nla_nest_end(skb, group);
+	return 0;
+
+group_msg_cancel:
+	nla_nest_cancel(skb, group);
+	return -1;
+}
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 5996af6e9f1d..76b4e5a7b39d 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
 			       u32 port, struct sk_buff_head *xmitq);
 u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
 void tipc_group_update_member(struct tipc_member *m, int len);
+int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb);
 #endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 14a5d055717d..840dd995f631 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3316,6 +3316,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
 		goto stat_msg_cancel;
 
 	nla_nest_end(skb, stat);
+
+	if (tsk->group)
+		if (tipc_group_fill_sock_diag(tsk->group, skb))
+			goto stat_msg_cancel;
+
 	nla_nest_end(skb, attrs);
 
 	return 0;
-- 
cgit v1.2.3


From 7ce3f912ae0a79e5d738a3ae1f158b281973e849 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Sat, 30 Jun 2018 11:24:24 -0400
Subject: PCI: Enable PASID only if entire path supports End-End TLP prefixes

A PCIe endpoint carries the process address space identifier (PASID) in
the TLP prefix as part of the memory read/write transaction. The address
information in the TLP is relevant only for a given PASID context.

An IOMMU takes PASID value and the address information from the
TLP to look up the physical address in the system.

PASID is an End-End TLP Prefix (PCIe r4.0, sec 6.20).  Sec 2.2.10.2 says

  It is an error to receive a TLP with an End-End TLP Prefix by a
  Receiver that does not support End-End TLP Prefixes. A TLP in
  violation of this rule is handled as a Malformed TLP. This is a
  reported error associated with the Receiving Port (see Section 6.2).

Prevent error condition by proactively requiring End-End TLP prefix to be
supported on the entire data path between the endpoint and the root port
before enabling PASID.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/ats.c             |  3 +++
 drivers/pci/probe.c           | 24 ++++++++++++++++++++++++
 include/linux/pci.h           |  1 +
 include/uapi/linux/pci_regs.h |  1 +
 4 files changed, 29 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 4923a2a8e14b..5b78f3b1b918 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -273,6 +273,9 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
 	if (WARN_ON(pdev->pasid_enabled))
 		return -EBUSY;
 
+	if (!pdev->eetlp_prefix_path)
+		return -EINVAL;
+
 	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
 	if (!pos)
 		return -EINVAL;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ac876e32de4b..4c35c2909d57 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2042,6 +2042,29 @@ static void pci_configure_ltr(struct pci_dev *dev)
 #endif
 }
 
+static void pci_configure_eetlp_prefix(struct pci_dev *dev)
+{
+#ifdef CONFIG_PCI_PASID
+	struct pci_dev *bridge;
+	u32 cap;
+
+	if (!pci_is_pcie(dev))
+		return;
+
+	pcie_capability_read_dword(dev, PCI_EXP_DEVCAP2, &cap);
+	if (!(cap & PCI_EXP_DEVCAP2_EE_PREFIX))
+		return;
+
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+		dev->eetlp_prefix_path = 1;
+	else {
+		bridge = pci_upstream_bridge(dev);
+		if (bridge && bridge->eetlp_prefix_path)
+			dev->eetlp_prefix_path = 1;
+	}
+#endif
+}
+
 static void pci_configure_device(struct pci_dev *dev)
 {
 	struct hotplug_params hpp;
@@ -2051,6 +2074,7 @@ static void pci_configure_device(struct pci_dev *dev)
 	pci_configure_extended_tags(dev, NULL);
 	pci_configure_relaxed_ordering(dev);
 	pci_configure_ltr(dev);
+	pci_configure_eetlp_prefix(dev);
 
 	memset(&hpp, 0, sizeof(hpp));
 	ret = pci_get_hp_params(dev, &hpp);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..6ba818449095 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -350,6 +350,7 @@ struct pci_dev {
 	unsigned int	ltr_path:1;	/* Latency Tolerance Reporting
 					   supported from root to here */
 #endif
+	unsigned int	eetlp_prefix_path:1;	/* End-to-End TLP Prefix */
 
 	pci_channel_state_t error_state;	/* Current connectivity state */
 	struct device	dev;			/* Generic device interface */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 4da87e2ef8a8..04d7480db714 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -636,6 +636,7 @@
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE	0x00080000 /* Re-use WAKE# for OBFF */
+#define PCI_EXP_DEVCAP2_EE_PREFIX	0x00200000 /* End-End TLP Prefix */
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
 #define  PCI_EXP_DEVCTL2_COMP_TMOUT_DIS	0x0010	/* Completion Timeout Disable */
-- 
cgit v1.2.3


From 0b0dce7a36fb9f1a9dd8245ea82d3a268c6943fe Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 2 Jul 2018 18:21:13 +0800
Subject: sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams

spp_ipv6_flowlabel and spp_dscp are added to sctp_paddrparams in
this patch so that users can set the sctp_sock/asoc/transport dscp
and flowlabel with spp_flags SPP_IPV6_FLOWLABEL or SPP_DSCP via
SCTP_PEER_ADDR_PARAMS, as described in section 8.1.12 of RFC6458.

As said in the last patch, it uses '| 0x100000' or '| 0x1' to mark
that flowlabel or dscp is set, so that their values can be set
to 0.

Note that to guarantee that an old app built with old kernel
headers still works on a newer kernel, the param check in
sctp_g/setsockopt_peer_addr_params() is also improved. It follows
the approach of sctp_g/setsockopt_delayed_ack() and the other
sockopt handlers that accept two types of params.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h |   4 ++
 net/sctp/socket.c         | 177 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 175 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c02986a284db..b479db5c71d9 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -763,6 +763,8 @@ enum  sctp_spp_flags {
 	SPP_SACKDELAY_DISABLE = 1<<6,	/*Disable SACK*/
 	SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE,
 	SPP_HB_TIME_IS_ZERO = 1<<7,	/* Set HB delay to 0 */
+	SPP_IPV6_FLOWLABEL = 1<<8,
+	SPP_DSCP = 1<<9,
 };
 
 struct sctp_paddrparams {
@@ -773,6 +775,8 @@ struct sctp_paddrparams {
 	__u32			spp_pathmtu;
 	__u32			spp_sackdelay;
 	__u32			spp_flags;
+	__u32			spp_ipv6_flowlabel;
+	__u8			spp_dscp;
 } __attribute__((packed, aligned(4)));
 
 /*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0e4c8332771a..50b7ef975b42 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2393,6 +2393,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
  *     uint32_t                spp_pathmtu;
  *     uint32_t                spp_sackdelay;
  *     uint32_t                spp_flags;
+ *     uint32_t                spp_ipv6_flowlabel;
+ *     uint8_t                 spp_dscp;
  * };
  *
  *   spp_assoc_id    - (one-to-many style socket) This is filled in the
@@ -2472,6 +2474,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
  *                     also that this field is mutually exclusive to
  *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
  *                     results.
+ *
+ *                     SPP_IPV6_FLOWLABEL:  Setting this flag enables the
+ *                     setting of the IPV6 flow label value.  The value is
+ *                     contained in the spp_ipv6_flowlabel field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_ipv6_flowlabel field has a valid value returned.
+ *                     If a specific destination address is set (in the
+ *                     spp_address field), then the value returned is that of
+ *                     the address.  If just an association is specified (and
+ *                     no address), then the association's default flow label
+ *                     is returned.  If neither an association nor a destination
+ *                     is specified, then the socket's default flow label is
+ *                     returned.  For non-IPv6 sockets, this flag will be left
+ *                     cleared.
+ *
+ *                     SPP_DSCP:  Setting this flag enables the setting of the
+ *                     Differentiated Services Code Point (DSCP) value
+ *                     associated with either the association or a specific
+ *                     address.  The value is obtained in the spp_dscp field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_dscp field has a valid value returned.  If a
+ *                     specific destination address is set when called (in the
+ *                     spp_address field), then that specific destination
+ *                     address's DSCP value is returned.  If just an association
+ *                     is specified, then the association's default DSCP is
+ *                     returned.  If neither an association nor a destination is
+ *                     specified, then the socket's default DSCP is returned.
+ *
+ *   spp_ipv6_flowlabel
+ *                   - This field is used in conjunction with the
+ *                     SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ *                     The 20 least significant bits are used for the flow
+ *                     label.  This setting has precedence over any IPv6-layer
+ *                     setting.
+ *
+ *   spp_dscp        - This field is used in conjunction with the SPP_DSCP flag
+ *                     and contains the DSCP.  The 6 most significant bits are
+ *                     used for the DSCP.  This setting has precedence over any
+ *                     IPv4- or IPv6- layer setting.
  */
 static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 				       struct sctp_transport   *trans,
@@ -2611,6 +2652,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 		}
 	}
 
+	if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
+		if (trans && trans->ipaddr.sa.sa_family == AF_INET6) {
+			trans->flowlabel = params->spp_ipv6_flowlabel &
+					   SCTP_FLOWLABEL_VAL_MASK;
+			trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		} else if (asoc) {
+			list_for_each_entry(trans,
+					    &asoc->peer.transport_addr_list,
+					    transports) {
+				if (trans->ipaddr.sa.sa_family != AF_INET6)
+					continue;
+				trans->flowlabel = params->spp_ipv6_flowlabel &
+						   SCTP_FLOWLABEL_VAL_MASK;
+				trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+			}
+			asoc->flowlabel = params->spp_ipv6_flowlabel &
+					  SCTP_FLOWLABEL_VAL_MASK;
+			asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		} else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
+			sp->flowlabel = params->spp_ipv6_flowlabel &
+					SCTP_FLOWLABEL_VAL_MASK;
+			sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		}
+	}
+
+	if (params->spp_flags & SPP_DSCP) {
+		if (trans) {
+			trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+			trans->dscp |= SCTP_DSCP_SET_MASK;
+		} else if (asoc) {
+			list_for_each_entry(trans,
+					    &asoc->peer.transport_addr_list,
+					    transports) {
+				trans->dscp = params->spp_dscp &
+					      SCTP_DSCP_VAL_MASK;
+				trans->dscp |= SCTP_DSCP_SET_MASK;
+			}
+			asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+			asoc->dscp |= SCTP_DSCP_SET_MASK;
+		} else {
+			sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+			sp->dscp |= SCTP_DSCP_SET_MASK;
+		}
+	}
+
 	return 0;
 }
 
@@ -2625,11 +2711,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk,
 	int error;
 	int hb_change, pmtud_change, sackdelay_change;
 
-	if (optlen != sizeof(struct sctp_paddrparams))
+	if (optlen == sizeof(params)) {
+		if (copy_from_user(&params, optval, optlen))
+			return -EFAULT;
+	} else if (optlen == ALIGN(offsetof(struct sctp_paddrparams,
+					    spp_ipv6_flowlabel), 4)) {
+		if (copy_from_user(&params, optval, optlen))
+			return -EFAULT;
+		if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL))
+			return -EINVAL;
+	} else {
 		return -EINVAL;
-
-	if (copy_from_user(&params, optval, optlen))
-		return -EFAULT;
+	}
 
 	/* Validate flags and value parameters. */
 	hb_change        = params.spp_flags & SPP_HB;
@@ -5453,6 +5546,45 @@ out:
  *                     also that this field is mutually exclusive to
  *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
  *                     results.
+ *
+ *                     SPP_IPV6_FLOWLABEL:  Setting this flag enables the
+ *                     setting of the IPV6 flow label value.  The value is
+ *                     contained in the spp_ipv6_flowlabel field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_ipv6_flowlabel field has a valid value returned.
+ *                     If a specific destination address is set (in the
+ *                     spp_address field), then the value returned is that of
+ *                     the address.  If just an association is specified (and
+ *                     no address), then the association's default flow label
+ *                     is returned.  If neither an association nor a destination
+ *                     is specified, then the socket's default flow label is
+ *                     returned.  For non-IPv6 sockets, this flag will be left
+ *                     cleared.
+ *
+ *                     SPP_DSCP:  Setting this flag enables the setting of the
+ *                     Differentiated Services Code Point (DSCP) value
+ *                     associated with either the association or a specific
+ *                     address.  The value is obtained in the spp_dscp field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_dscp field has a valid value returned.  If a
+ *                     specific destination address is set when called (in the
+ *                     spp_address field), then that specific destination
+ *                     address's DSCP value is returned.  If just an association
+ *                     is specified, then the association's default DSCP is
+ *                     returned.  If neither an association nor a destination is
+ *                     specified, then the socket's default DSCP is returned.
+ *
+ *   spp_ipv6_flowlabel
+ *                   - This field is used in conjunction with the
+ *                     SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ *                     The 20 least significant bits are used for the flow
+ *                     label.  This setting has precedence over any IPv6-layer
+ *                     setting.
+ *
+ *   spp_dscp        - This field is used in conjunction with the SPP_DSCP flag
+ *                     and contains the DSCP.  The 6 most significant bits are
+ *                     used for the DSCP.  This setting has precedence over any
+ *                     IPv4- or IPv6- layer setting.
  */
 static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 					    char __user *optval, int __user *optlen)
@@ -5462,9 +5594,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 	struct sctp_association *asoc = NULL;
 	struct sctp_sock        *sp = sctp_sk(sk);
 
-	if (len < sizeof(struct sctp_paddrparams))
+	if (len >= sizeof(params))
+		len = sizeof(params);
+	else if (len >= ALIGN(offsetof(struct sctp_paddrparams,
+				       spp_ipv6_flowlabel), 4))
+		len = ALIGN(offsetof(struct sctp_paddrparams,
+				     spp_ipv6_flowlabel), 4);
+	else
 		return -EINVAL;
-	len = sizeof(struct sctp_paddrparams);
+
 	if (copy_from_user(&params, optval, len))
 		return -EFAULT;
 
@@ -5499,6 +5637,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 
 		/*draft-11 doesn't say what to return in spp_flags*/
 		params.spp_flags      = trans->param_flags;
+		if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+			params.spp_ipv6_flowlabel = trans->flowlabel &
+						    SCTP_FLOWLABEL_VAL_MASK;
+			params.spp_flags |= SPP_IPV6_FLOWLABEL;
+		}
+		if (trans->dscp & SCTP_DSCP_SET_MASK) {
+			params.spp_dscp	= trans->dscp & SCTP_DSCP_VAL_MASK;
+			params.spp_flags |= SPP_DSCP;
+		}
 	} else if (asoc) {
 		/* Fetch association values. */
 		params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
@@ -5508,6 +5655,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 
 		/*draft-11 doesn't say what to return in spp_flags*/
 		params.spp_flags      = asoc->param_flags;
+		if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+			params.spp_ipv6_flowlabel = asoc->flowlabel &
+						    SCTP_FLOWLABEL_VAL_MASK;
+			params.spp_flags |= SPP_IPV6_FLOWLABEL;
+		}
+		if (asoc->dscp & SCTP_DSCP_SET_MASK) {
+			params.spp_dscp	= asoc->dscp & SCTP_DSCP_VAL_MASK;
+			params.spp_flags |= SPP_DSCP;
+		}
 	} else {
 		/* Fetch socket values. */
 		params.spp_hbinterval = sp->hbinterval;
@@ -5517,6 +5673,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 
 		/*draft-11 doesn't say what to return in spp_flags*/
 		params.spp_flags      = sp->param_flags;
+		if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+			params.spp_ipv6_flowlabel = sp->flowlabel &
+						    SCTP_FLOWLABEL_VAL_MASK;
+			params.spp_flags |= SPP_IPV6_FLOWLABEL;
+		}
+		if (sp->dscp & SCTP_DSCP_SET_MASK) {
+			params.spp_dscp	= sp->dscp & SCTP_DSCP_VAL_MASK;
+			params.spp_flags |= SPP_DSCP;
+		}
 	}
 
 	if (copy_to_user(optval, &params, len))
-- 
cgit v1.2.3


From 421860b9d47053badce4b247576fa48df9ab4c48 Mon Sep 17 00:00:00 2001
From: Steve Longerbeam <slongerbeam@gmail.com>
Date: Wed, 27 Jun 2018 14:39:43 -0400
Subject: media: v4l2-ctrls: Fix CID base conflict between MAX217X and IMX

When the imx-media driver was initially merged, there was a conflict
with 8d67ae25 ("media: v4l2-ctrls: Reserve controls for MAX217X") which
was not fixed up correctly, resulting in V4L2_CID_USER_MAX217X_BASE and
V4L2_CID_USER_IMX_BASE taking on the same value. Fix by assigning imx
CID base the next available range at 0x10b0.

Signed-off-by: Steve Longerbeam <steve_longerbeam@mentor.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/v4l2-controls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 8d473c979b61..8a75ad7899f3 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -188,7 +188,7 @@ enum v4l2_colorfx {
 
 /* The base for the imx driver controls.
  * We reserve 16 controls for this driver. */
-#define V4L2_CID_USER_IMX_BASE			(V4L2_CID_USER_BASE + 0x1090)
+#define V4L2_CID_USER_IMX_BASE			(V4L2_CID_USER_BASE + 0x10b0)
 
 /* MPEG-class control IDs */
 /* The MPEG controls are applicable to all codec controls
-- 
cgit v1.2.3


From e7e3728bd776d1d1450212ad266832f1003f833f Mon Sep 17 00:00:00 2001
From: Qiaobin Fu <qiaobinf@bu.edu>
Date: Sun, 1 Jul 2018 15:16:27 -0400
Subject: net:sched: add action inheritdsfield to skbedit

The new action inheritdsfield copies the DS field of
IPv4 and IPv6 packets into skb->priority. This enables
later classification of packets based on the DS field.

v5:
*Update the drop counter for TC_ACT_SHOT

v4:
*Do not allow setting flags other than the expected ones.

*Allow dumping the pure flags.

v3:
*Use optional flags, so that it won't break old versions of tc.

*Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags.

v2:
*Fix the style issue

*Move the code from skbmod to skbedit

Original idea by Jamal Hadi Salim <jhs@mojatatu.com>

Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_skbedit.h |  2 ++
 net/sched/act_skbedit.c                | 41 ++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index fbcfe27a4e6c..6de6071ebed6 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -30,6 +30,7 @@
 #define SKBEDIT_F_MARK			0x4
 #define SKBEDIT_F_PTYPE			0x8
 #define SKBEDIT_F_MASK			0x10
+#define SKBEDIT_F_INHERITDSFIELD	0x20
 
 struct tc_skbedit {
 	tc_gen;
@@ -45,6 +46,7 @@ enum {
 	TCA_SKBEDIT_PAD,
 	TCA_SKBEDIT_PTYPE,
 	TCA_SKBEDIT_MASK,
+	TCA_SKBEDIT_FLAGS,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..dfaf5d8028dd 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
 #include <linux/rtnetlink.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
 
 #include <linux/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_skbedit.h>
@@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD) {
+		int wlen = skb_network_offset(skb);
+
+		switch (tc_skb_protocol(skb)) {
+		case htons(ETH_P_IP):
+			wlen += sizeof(struct iphdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+			break;
+
+		case htons(ETH_P_IPV6):
+			wlen += sizeof(struct ipv6hdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+			break;
+		}
+	}
 	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
 	    skb->dev->real_num_tx_queues > d->queue_mapping)
 		skb_set_queue_mapping(skb, d->queue_mapping);
@@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
+
+err:
+	d->tcf_qstats.drops++;
+	spin_unlock(&d->tcf_lock);
+	return TC_ACT_SHOT;
 }
 
 static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
 	[TCA_SKBEDIT_PTYPE]		= { .len = sizeof(u16) },
 	[TCA_SKBEDIT_MASK]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_FLAGS]		= { .len = sizeof(u64) },
 };
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		mask = nla_data(tb[TCA_SKBEDIT_MASK]);
 	}
 
+	if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+		u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+		if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+			flags |= SKBEDIT_F_INHERITDSFIELD;
+	}
+
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
 	exists = tcf_idr_check(tn, parm->index, a, bind);
@@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 		.action  = d->tcf_action,
 	};
 	struct tcf_t t;
+	u64 pure_flags = 0;
 
 	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	if ((d->flags & SKBEDIT_F_MASK) &&
 	    nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
 		goto nla_put_failure;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD)
+		pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+	if (pure_flags != 0 &&
+	    nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
+		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
-- 
cgit v1.2.3


From 5520b9467a39d5ec9ce9cd8a9ed01f826b817524 Mon Sep 17 00:00:00 2001
From: Keiichi Watanabe <keiichiw@chromium.org>
Date: Mon, 18 Jun 2018 03:58:52 -0400
Subject: media: v4l2-ctrl: Change control for VP8 profile to menu control

Add a menu control V4L2_CID_MPEG_VIDEO_VP8_PROFILE for VP8 profile and make
V4L2_CID_MPEG_VIDEO_VPX_PROFILE an alias of it. This new control is used to
select the desired profile for VP8 encoder and query for supported profiles by
VP8 encoder/decoder.

Although the original control V4L2_CID_MPEG_VIDEO_VPX_PROFILE has 'VPX' in its
name, it works only for VP8 because supported profiles usually differ between
VP8 and VP9. In addition, this control cannot be used for querying since it is
an integer control rather than a menu control, and so cannot return an
arbitrary set of supported profiles.

The new control V4L2_CID_MPEG_VIDEO_VP8_PROFILE is a menu control, as are the
controls for other codec profiles (e.g. H264).

In addition, this patch also fixes the use of V4L2_CID_MPEG_VIDEO_VPX_PROFILE in
drivers of Qualcomm's venus and Samsung's s5p-mfc.

Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/extended-controls.rst | 25 +++++++++++++++++++---
 drivers/media/platform/qcom/venus/vdec_ctrls.c     | 10 +++++----
 drivers/media/platform/qcom/venus/venc.c           |  4 ++--
 drivers/media/platform/qcom/venus/venc_ctrls.c     | 10 +++++----
 drivers/media/platform/s5p-mfc/s5p_mfc_enc.c       | 15 ++++++-------
 drivers/media/v4l2-core/v4l2-ctrls.c               | 12 ++++++++++-
 include/uapi/linux/v4l2-controls.h                 | 11 +++++++++-
 7 files changed, 64 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index 03931f9b1285..01ef31a934b4 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -1955,9 +1955,28 @@ enum v4l2_vp8_golden_frame_sel -
 ``V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP (integer)``
     Quantization parameter for a P frame for VP8.
 
-``V4L2_CID_MPEG_VIDEO_VPX_PROFILE (integer)``
-    Select the desired profile for VPx encoder. Acceptable values are 0,
-    1, 2 and 3 corresponding to encoder profiles 0, 1, 2 and 3.
+.. _v4l2-mpeg-video-vp8-profile:
+
+``V4L2_CID_MPEG_VIDEO_VP8_PROFILE``
+    (enum)
+
+enum v4l2_mpeg_video_vp8_profile -
+    This control allows selecting the profile for VP8 encoder.
+    This is also used to enumerate supported profiles by VP8 encoder or decoder.
+    Possible values are:
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - ``V4L2_MPEG_VIDEO_VP8_PROFILE_0``
+      - Profile 0
+    * - ``V4L2_MPEG_VIDEO_VP8_PROFILE_1``
+      - Profile 1
+    * - ``V4L2_MPEG_VIDEO_VP8_PROFILE_2``
+      - Profile 2
+    * - ``V4L2_MPEG_VIDEO_VP8_PROFILE_3``
+      - Profile 3
 
 
 High Efficiency Video Coding (HEVC/H.265) Control Reference
diff --git a/drivers/media/platform/qcom/venus/vdec_ctrls.c b/drivers/media/platform/qcom/venus/vdec_ctrls.c
index 032839bbc967..f4604b0cd57e 100644
--- a/drivers/media/platform/qcom/venus/vdec_ctrls.c
+++ b/drivers/media/platform/qcom/venus/vdec_ctrls.c
@@ -29,7 +29,7 @@ static int vdec_op_s_ctrl(struct v4l2_ctrl *ctrl)
 		break;
 	case V4L2_CID_MPEG_VIDEO_H264_PROFILE:
 	case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE:
-	case V4L2_CID_MPEG_VIDEO_VPX_PROFILE:
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 		ctr->profile = ctrl->val;
 		break;
 	case V4L2_CID_MPEG_VIDEO_H264_LEVEL:
@@ -54,7 +54,7 @@ static int vdec_op_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
 	switch (ctrl->id) {
 	case V4L2_CID_MPEG_VIDEO_H264_PROFILE:
 	case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE:
-	case V4L2_CID_MPEG_VIDEO_VPX_PROFILE:
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 		ret = hfi_session_get_property(inst, ptype, &hprop);
 		if (!ret)
 			ctr->profile = hprop.profile_level.profile;
@@ -130,8 +130,10 @@ int vdec_ctrl_init(struct venus_inst *inst)
 	if (ctrl)
 		ctrl->flags |= V4L2_CTRL_FLAG_VOLATILE;
 
-	ctrl = v4l2_ctrl_new_std(&inst->ctrl_handler, &vdec_ctrl_ops,
-				 V4L2_CID_MPEG_VIDEO_VPX_PROFILE, 0, 3, 1, 0);
+	ctrl = v4l2_ctrl_new_std_menu(&inst->ctrl_handler, &vdec_ctrl_ops,
+				      V4L2_CID_MPEG_VIDEO_VP8_PROFILE,
+				      V4L2_MPEG_VIDEO_VP8_PROFILE_3,
+				      0, V4L2_MPEG_VIDEO_VP8_PROFILE_0);
 	if (ctrl)
 		ctrl->flags |= V4L2_CTRL_FLAG_VOLATILE;
 
diff --git a/drivers/media/platform/qcom/venus/venc.c b/drivers/media/platform/qcom/venus/venc.c
index 11dafc7848c5..f7a87a3dbb46 100644
--- a/drivers/media/platform/qcom/venus/venc.c
+++ b/drivers/media/platform/qcom/venus/venc.c
@@ -223,7 +223,7 @@ static int venc_v4l2_to_hfi(int id, int value)
 		case V4L2_MPEG_VIDEO_H264_ENTROPY_MODE_CABAC:
 			return HFI_H264_ENTROPY_CABAC;
 		}
-	case V4L2_CID_MPEG_VIDEO_VPX_PROFILE:
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 		switch (value) {
 		case 0:
 		default:
@@ -754,7 +754,7 @@ static int venc_set_properties(struct venus_inst *inst)
 		level = venc_v4l2_to_hfi(V4L2_CID_MPEG_VIDEO_H264_LEVEL,
 					 ctr->level.h264);
 	} else if (inst->fmt_cap->pixfmt == V4L2_PIX_FMT_VP8) {
-		profile = venc_v4l2_to_hfi(V4L2_CID_MPEG_VIDEO_VPX_PROFILE,
+		profile = venc_v4l2_to_hfi(V4L2_CID_MPEG_VIDEO_VP8_PROFILE,
 					   ctr->profile.vpx);
 		level = 0;
 	} else if (inst->fmt_cap->pixfmt == V4L2_PIX_FMT_MPEG4) {
diff --git a/drivers/media/platform/qcom/venus/venc_ctrls.c b/drivers/media/platform/qcom/venus/venc_ctrls.c
index 21e938a28662..459101728d26 100644
--- a/drivers/media/platform/qcom/venus/venc_ctrls.c
+++ b/drivers/media/platform/qcom/venus/venc_ctrls.c
@@ -101,7 +101,7 @@ static int venc_op_s_ctrl(struct v4l2_ctrl *ctrl)
 	case V4L2_CID_MPEG_VIDEO_H264_PROFILE:
 		ctr->profile.h264 = ctrl->val;
 		break;
-	case V4L2_CID_MPEG_VIDEO_VPX_PROFILE:
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 		ctr->profile.vpx = ctrl->val;
 		break;
 	case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL:
@@ -248,6 +248,11 @@ int venc_ctrl_init(struct venus_inst *inst)
 		V4L2_MPEG_VIDEO_MULTI_SICE_MODE_MAX_BYTES,
 		0, V4L2_MPEG_VIDEO_MULTI_SLICE_MODE_SINGLE);
 
+	v4l2_ctrl_new_std_menu(&inst->ctrl_handler, &venc_ctrl_ops,
+		V4L2_CID_MPEG_VIDEO_VP8_PROFILE,
+		V4L2_MPEG_VIDEO_VP8_PROFILE_3,
+		0, V4L2_MPEG_VIDEO_VP8_PROFILE_0);
+
 	v4l2_ctrl_new_std(&inst->ctrl_handler, &venc_ctrl_ops,
 		V4L2_CID_MPEG_VIDEO_BITRATE, BITRATE_MIN, BITRATE_MAX,
 		BITRATE_STEP, BITRATE_DEFAULT);
@@ -256,9 +261,6 @@ int venc_ctrl_init(struct venus_inst *inst)
 		V4L2_CID_MPEG_VIDEO_BITRATE_PEAK, BITRATE_MIN, BITRATE_MAX,
 		BITRATE_STEP, BITRATE_DEFAULT_PEAK);
 
-	v4l2_ctrl_new_std(&inst->ctrl_handler, &venc_ctrl_ops,
-		V4L2_CID_MPEG_VIDEO_VPX_PROFILE, 0, 3, 1, 0);
-
 	v4l2_ctrl_new_std(&inst->ctrl_handler, &venc_ctrl_ops,
 		V4L2_CID_MPEG_VIDEO_H264_I_FRAME_QP, 1, 51, 1, 26);
 
diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c
index 570f391f2cfd..3ad4f5073002 100644
--- a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c
+++ b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c
@@ -692,12 +692,12 @@ static struct mfc_control controls[] = {
 		.default_value = 10,
 	},
 	{
-		.id = V4L2_CID_MPEG_VIDEO_VPX_PROFILE,
-		.type = V4L2_CTRL_TYPE_INTEGER,
-		.minimum = 0,
-		.maximum = 3,
-		.step = 1,
-		.default_value = 0,
+		.id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE,
+		.type = V4L2_CTRL_TYPE_MENU,
+		.minimum = V4L2_MPEG_VIDEO_VP8_PROFILE_0,
+		.maximum = V4L2_MPEG_VIDEO_VP8_PROFILE_3,
+		.default_value = V4L2_MPEG_VIDEO_VP8_PROFILE_0,
+		.menu_skip_mask = 0,
 	},
 	{
 		.id = V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP,
@@ -2057,7 +2057,7 @@ static int s5p_mfc_enc_s_ctrl(struct v4l2_ctrl *ctrl)
 	case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP:
 		p->codec.vp8.rc_p_frame_qp = ctrl->val;
 		break;
-	case V4L2_CID_MPEG_VIDEO_VPX_PROFILE:
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 		p->codec.vp8.profile = ctrl->val;
 		break;
 	case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP:
@@ -2711,4 +2711,3 @@ void s5p_mfc_enc_init(struct s5p_mfc_ctx *ctx)
 	f.fmt.pix_mp.pixelformat = DEF_DST_FMT_ENC;
 	ctx->dst_fmt = find_format(&f, MFC_FMT_ENC);
 }
-
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index d1087573da34..4a182a70de36 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -431,6 +431,13 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 		"Use Previous Specific Frame",
 		NULL,
 	};
+	static const char * const vp8_profile[] = {
+		"0",
+		"1",
+		"2",
+		"3",
+		NULL,
+	};
 
 	static const char * const flash_led_mode[] = {
 		"Off",
@@ -614,6 +621,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 		return mpeg4_profile;
 	case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL:
 		return vpx_golden_frame_sel;
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
+		return vp8_profile;
 	case V4L2_CID_JPEG_CHROMA_SUBSAMPLING:
 		return jpeg_chroma_subsampling;
 	case V4L2_CID_DV_TX_MODE:
@@ -839,7 +848,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_VPX_MAX_QP:			return "VPX Maximum QP Value";
 	case V4L2_CID_MPEG_VIDEO_VPX_I_FRAME_QP:		return "VPX I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP:		return "VPX P-Frame QP Value";
-	case V4L2_CID_MPEG_VIDEO_VPX_PROFILE:			return "VPX Profile";
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:			return "VP8 Profile";
 
 	/* HEVC controls */
 	case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP:		return "HEVC I-Frame QP Value";
@@ -1180,6 +1189,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_DEINTERLACING_MODE:
 	case V4L2_CID_TUNE_DEEMPHASIS:
 	case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL:
+	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 	case V4L2_CID_DETECT_MD_MODE:
 	case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE:
 	case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 8a75ad7899f3..ab96795b2829 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -587,7 +587,16 @@ enum v4l2_vp8_golden_frame_sel {
 #define V4L2_CID_MPEG_VIDEO_VPX_MAX_QP			(V4L2_CID_MPEG_BASE+508)
 #define V4L2_CID_MPEG_VIDEO_VPX_I_FRAME_QP		(V4L2_CID_MPEG_BASE+509)
 #define V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP		(V4L2_CID_MPEG_BASE+510)
-#define V4L2_CID_MPEG_VIDEO_VPX_PROFILE			(V4L2_CID_MPEG_BASE+511)
+
+#define V4L2_CID_MPEG_VIDEO_VP8_PROFILE			(V4L2_CID_MPEG_BASE+511)
+enum v4l2_mpeg_video_vp8_profile {
+	V4L2_MPEG_VIDEO_VP8_PROFILE_0				= 0,
+	V4L2_MPEG_VIDEO_VP8_PROFILE_1				= 1,
+	V4L2_MPEG_VIDEO_VP8_PROFILE_2				= 2,
+	V4L2_MPEG_VIDEO_VP8_PROFILE_3				= 3,
+};
+/* Deprecated alias for compatibility reasons. */
+#define V4L2_CID_MPEG_VIDEO_VPX_PROFILE	V4L2_CID_MPEG_VIDEO_VP8_PROFILE
 
 /* CIDs for HEVC encoding. */
 
-- 
cgit v1.2.3


From 80b14dee2bea128928537d61c333f24cb8cbb62f Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Tue, 3 Jul 2018 15:42:48 -0700
Subject: net: Add a new socket option for a future transmit time.

This patch introduces SO_TXTIME. User space enables this option in
order to pass a desired future transmit time in a CMSG when calling
sendmsg(2). The argument to this socket option is an 8-byte struct
provided by the uapi header net_tstamp.h, defined as:

struct sock_txtime {
	clockid_t 	clockid;
	u32		flags;
};

Note that new fields were added to struct sock by filling a 2-byte
hole found in the struct. For that reason, neither the struct size nor
the number of cachelines was altered.

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h  |  3 +++
 arch/ia64/include/uapi/asm/socket.h   |  3 +++
 arch/mips/include/uapi/asm/socket.h   |  3 +++
 arch/parisc/include/uapi/asm/socket.h |  3 +++
 arch/s390/include/uapi/asm/socket.h   |  3 +++
 arch/sparc/include/uapi/asm/socket.h  |  3 +++
 arch/xtensa/include/uapi/asm/socket.h |  3 +++
 include/net/sock.h                    | 10 ++++++++++
 include/uapi/asm-generic/socket.h     |  3 +++
 include/uapi/linux/net_tstamp.h       | 15 +++++++++++++++
 net/core/sock.c                       | 35 +++++++++++++++++++++++++++++++++++
 11 files changed, 84 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index be14f16149d5..065fb372e355 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -112,4 +112,7 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_TXTIME		61
+#define SCM_TXTIME		SO_TXTIME
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 3efba40adc54..c872c4e6bafb 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -114,4 +114,7 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_TXTIME		61
+#define SCM_TXTIME		SO_TXTIME
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 49c3d4795963..71370fb3ceef 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -123,4 +123,7 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_TXTIME		61
+#define SCM_TXTIME		SO_TXTIME
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 1d0fdc3b5d22..061b9cf2a779 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -104,4 +104,7 @@
 
 #define SO_ZEROCOPY		0x4035
 
+#define SO_TXTIME		0x4036
+#define SCM_TXTIME		SO_TXTIME
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 3510c0fd06f4..39d901476ee5 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -111,4 +111,7 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_TXTIME		61
+#define SCM_TXTIME		SO_TXTIME
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index d58520c2e6ff..7ea35e5601b6 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -101,6 +101,9 @@
 
 #define SO_ZEROCOPY		0x003e
 
+#define SO_TXTIME		0x003f
+#define SCM_TXTIME		SO_TXTIME
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 75a07b8119a9..1de07a7f7680 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -116,4 +116,7 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_TXTIME		61
+#define SCM_TXTIME		SO_TXTIME
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 2ed99bfa4595..68347b9821c6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,9 @@ struct sock_common {
   *	@sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
   *	@sk_reuseport_cb: reuseport group container
   *	@sk_rcu: used during RCU grace period
+  *	@sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
+  *	@sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
+  *	@sk_txtime_unused: unused txtime flags
   */
 struct sock {
 	/*
@@ -475,6 +478,11 @@ struct sock {
 	u8			sk_shutdown;
 	u32			sk_tskey;
 	atomic_t		sk_zckey;
+
+	u8			sk_clockid;
+	u8			sk_txtime_deadline_mode : 1,
+				sk_txtime_unused : 7;
+
 	struct socket		*sk_socket;
 	void			*sk_user_data;
 #ifdef CONFIG_SECURITY
@@ -790,6 +798,7 @@ enum sock_flags {
 	SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
+	SOCK_TXTIME,
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1585,6 +1594,7 @@ void sock_kzfree_s(struct sock *sk, void *mem, int size);
 void sk_send_sigurg(struct sock *sk);
 
 struct sockcm_cookie {
+	u64 transmit_time;
 	u32 mark;
 	u16 tsflags;
 };
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 0ae758c90e54..a12692e5f7a8 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -107,4 +107,7 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_TXTIME		61
+#define SCM_TXTIME		SO_TXTIME
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 4fe104b2411f..c9a77c353b98 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -141,4 +141,19 @@ struct scm_ts_pktinfo {
 	__u32 reserved[2];
 };
 
+/*
+ * SO_TXTIME gets a struct sock_txtime with flags being an integer bit
+ * field comprised of these values.
+ */
+enum txtime_flags {
+	SOF_TXTIME_DEADLINE_MODE = (1 << 0),
+
+	SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE)
+};
+
+struct sock_txtime {
+	clockid_t       clockid;        /* reference clockid */
+	u32             flags;          /* flags defined by enum txtime_flags */
+};
+
 #endif /* _NET_TIMESTAMPING_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 6429982eb976..fe64b839f1b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -91,6 +91,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <asm/unaligned.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/errqueue.h>
@@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop);
 int sock_setsockopt(struct socket *sock, int level, int optname,
 		    char __user *optval, unsigned int optlen)
 {
+	struct sock_txtime sk_txtime;
 	struct sock *sk = sock->sk;
 	int val;
 	int valbool;
@@ -1070,6 +1072,24 @@ set_rcvbuf:
 		}
 		break;
 
+	case SO_TXTIME:
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+			ret = -EPERM;
+		} else if (optlen != sizeof(struct sock_txtime)) {
+			ret = -EINVAL;
+		} else if (copy_from_user(&sk_txtime, optval,
+			   sizeof(struct sock_txtime))) {
+			ret = -EFAULT;
+		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
+			ret = -EINVAL;
+		} else {
+			sock_valbool_flag(sk, SOCK_TXTIME, true);
+			sk->sk_clockid = sk_txtime.clockid;
+			sk->sk_txtime_deadline_mode =
+				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+		}
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1115,6 +1135,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		u64 val64;
 		struct linger ling;
 		struct timeval tm;
+		struct sock_txtime txtime;
 	} v;
 
 	int lv = sizeof(int);
@@ -1403,6 +1424,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
 		break;
 
+	case SO_TXTIME:
+		lv = sizeof(v.txtime);
+		v.txtime.clockid = sk->sk_clockid;
+		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
+				  SOF_TXTIME_DEADLINE_MODE : 0;
+		break;
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
@@ -2137,6 +2165,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
 		sockc->tsflags |= tsflags;
 		break;
+	case SCM_TXTIME:
+		if (!sock_flag(sk, SOCK_TXTIME))
+			return -EINVAL;
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
+			return -EINVAL;
+		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
+		break;
 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
 	case SCM_RIGHTS:
 	case SCM_CREDENTIALS:
-- 
cgit v1.2.3


From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 3 Jul 2018 15:42:53 -0700
Subject: net/sched: Introduce the ETF Qdisc

The ETF (Earliest TxTime First) qdisc uses the information added
earlier in this series (the socket option SO_TXTIME and the new
role of sk_buff->tstamp) to schedule packets transmission based
on absolute time.

For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.

Example:

$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
           map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

$ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \
           clockid CLOCK_TAI

In this example, the Qdisc will provide SW best-effort for the control
of the transmission time to the network adapter, the time stamp in the
socket will be in reference to the clockid CLOCK_TAI and packets
will leave the qdisc "delta" (100000) nanoseconds before their transmission
time.

The ETF qdisc will buffer packets sorted by their txtime. It will drop
packets on enqueue() if their skbuff clockid does not match the clock
reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped
if it has expired while waiting in the queue.

The qdisc also supports the SO_TXTIME deadline mode. For this mode, it
will dequeue a packet as soon as possible and change the skb timestamp
to 'now' during etf_dequeue().

Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid
to be configured, but it's been decided that usage of CLOCK_TAI should
be enforced until we decide to allow for other clockids to be used.
The rationale here is that PTP times are usually in the TAI scale, thus
no other clocks should be necessary. For now, the qdisc will return
EINVAL if any clocks other than CLOCK_TAI are used.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h      |   1 +
 include/uapi/linux/pkt_sched.h |  17 ++
 net/sched/Kconfig              |  11 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_etf.c            | 384 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 414 insertions(+)
 create mode 100644 net/sched/sch_etf.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c1ef749b6f9f..f06ee8f91e74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -798,6 +798,7 @@ enum tc_setup_type {
 	TC_SETUP_QDISC_RED,
 	TC_SETUP_QDISC_PRIO,
 	TC_SETUP_QDISC_MQ,
+	TC_SETUP_QDISC_ETF,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index bad3c03bcf43..d5e933ce1447 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -937,4 +937,21 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* ETF */
+struct tc_etf_qopt {
+	__s32 delta;
+	__s32 clockid;
+	__u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+};
+
+enum {
+	TCA_ETF_UNSPEC,
+	TCA_ETF_PARMS,
+	__TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..fcc89706745b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -183,6 +183,17 @@ config NET_SCH_CBS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_cbs.
 
+config NET_SCH_ETF
+	tristate "Earliest TxTime First (ETF)"
+	help
+	  Say Y here if you want to use the Earliest TxTime First (ETF) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_etf.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_etf.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..9a5a7077d217 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..4b7f4903ac17
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c  Earliest TxTime First queueing discipline.
+ *
+ * Authors:	Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ *		Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+
+struct etf_sched_data {
+	bool deadline_mode;
+	int clockid;
+	int queue;
+	s32 delta; /* in ns */
+	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+	struct rb_root head;
+	struct qdisc_watchdog watchdog;
+	ktime_t (*get_time)(void);
+};
+
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+	[TCA_ETF_PARMS]	= { .len = sizeof(struct tc_etf_qopt) },
+};
+
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+					struct netlink_ext_ack *extack)
+{
+	/* Check if params comply to the following rules:
+	 *	* Clockid and delta must be valid.
+	 *
+	 *	* Dynamic clockids are not supported.
+	 *
+	 *	* Delta must be a positive integer.
+	 */
+	if (qopt->clockid < 0) {
+		NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+		return -ENOTSUPP;
+	}
+
+	if (qopt->clockid != CLOCK_TAI) {
+		NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
+		return -EINVAL;
+	}
+
+	if (qopt->delta < 0) {
+		NL_SET_ERR_MSG(extack, "Delta must be positive");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	ktime_t txtime = nskb->tstamp;
+	struct sock *sk = nskb->sk;
+	ktime_t now;
+
+	if (!sk)
+		return false;
+
+	if (!sock_flag(sk, SOCK_TXTIME))
+		return false;
+
+	/* We don't perform crosstimestamping.
+	 * Drop if packet's clockid differs from qdisc's.
+	 */
+	if (sk->sk_clockid != q->clockid)
+		return false;
+
+	if (sk->sk_txtime_deadline_mode != q->deadline_mode)
+		return false;
+
+	now = q->get_time();
+	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+		return false;
+
+	return true;
+}
+
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p;
+
+	p = rb_first(&q->head);
+	if (!p)
+		return NULL;
+
+	return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = etf_peek_timesortedlist(sch);
+	ktime_t next;
+
+	if (!skb)
+		return;
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+				      struct sk_buff **to_free)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node **p = &q->head.rb_node, *parent = NULL;
+	ktime_t txtime = nskb->tstamp;
+
+	if (!is_packet_valid(sch, nskb))
+		return qdisc_drop(nskb, sch, to_free);
+
+	while (*p) {
+		struct sk_buff *skb;
+
+		parent = *p;
+		skb = rb_to_skb(parent);
+		if (ktime_after(txtime, skb->tstamp))
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&nskb->rbnode, parent, p);
+	rb_insert_color(&nskb->rbnode, &q->head);
+
+	qdisc_qstats_backlog_inc(sch, nskb);
+	sch->q.qlen++;
+
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+				 bool drop)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	rb_erase(&skb->rbnode, &q->head);
+
+	/* The rbnode field in the skb re-uses these fields, now that
+	 * we are done with the rbnode, reset them.
+	 */
+	skb->next = NULL;
+	skb->prev = NULL;
+	skb->dev = qdisc_dev(sch);
+
+	qdisc_qstats_backlog_dec(sch, skb);
+
+	if (drop) {
+		struct sk_buff *to_free = NULL;
+
+		qdisc_drop(skb, sch, &to_free);
+		kfree_skb_list(to_free);
+		qdisc_qstats_overlimit(sch);
+	} else {
+		qdisc_bstats_update(sch, skb);
+
+		q->last = skb->tstamp;
+	}
+
+	sch->q.qlen--;
+}
+
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	ktime_t now, next;
+
+	skb = etf_peek_timesortedlist(sch);
+	if (!skb)
+		return NULL;
+
+	now = q->get_time();
+
+	/* Drop if packet has expired while in queue. */
+	/* FIXME: Must return error on the socket's error queue */
+	if (ktime_before(skb->tstamp, now)) {
+		timesortedlist_erase(sch, skb, true);
+		skb = NULL;
+		goto out;
+	}
+
+	/* When in deadline mode, dequeue as soon as possible and change the
+	 * txtime from deadline to (now + delta).
+	 */
+	if (q->deadline_mode) {
+		timesortedlist_erase(sch, skb, false);
+		skb->tstamp = now;
+		goto out;
+	}
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+
+	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
+	if (ktime_after(now, next))
+		timesortedlist_erase(sch, skb, false);
+	else
+		skb = NULL;
+
+out:
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return skb;
+}
+
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct nlattr *tb[TCA_ETF_MAX + 1];
+	struct tc_etf_qopt *qopt;
+	int err;
+
+	if (!opt) {
+		NL_SET_ERR_MSG(extack,
+			       "Missing ETF qdisc options which are mandatory");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_ETF_PARMS]) {
+		NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+		return -EINVAL;
+	}
+
+	qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+	pr_debug("delta %d clockid %d deadline %s\n",
+		 qopt->delta, qopt->clockid,
+		 DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+	err = validate_input_params(qopt, extack);
+	if (err < 0)
+		return err;
+
+	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+	/* Everything went OK, save the parameters used. */
+	q->delta = qopt->delta;
+	q->clockid = qopt->clockid;
+	q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+	switch (q->clockid) {
+	case CLOCK_REALTIME:
+		q->get_time = ktime_get_real;
+		break;
+	case CLOCK_MONOTONIC:
+		q->get_time = ktime_get;
+		break;
+	case CLOCK_BOOTTIME:
+		q->get_time = ktime_get_boottime;
+		break;
+	case CLOCK_TAI:
+		q->get_time = ktime_get_clocktai;
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Clockid is not supported");
+		return -ENOTSUPP;
+	}
+
+	qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+	return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p = rb_first(&q->head);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+
+		rb_erase(&skb->rbnode, &q->head);
+		rtnl_kfree_skbs(skb, skb);
+		sch->q.qlen--;
+	}
+}
+
+static void etf_reset(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+
+	/* No matter which mode we are on, it's safe to clear both lists. */
+	timesortedlist_clear(sch);
+	__qdisc_reset_queue(&sch->q);
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	q->last = 0;
+}
+
+static void etf_destroy(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct tc_etf_qopt opt = { };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.delta = q->delta;
+	opt.clockid = q->clockid;
+	if (q->deadline_mode)
+		opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+	if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+	.id		=	"etf",
+	.priv_size	=	sizeof(struct etf_sched_data),
+	.enqueue	=	etf_enqueue_timesortedlist,
+	.dequeue	=	etf_dequeue_timesortedlist,
+	.peek		=	etf_peek_timesortedlist,
+	.init		=	etf_init,
+	.reset		=	etf_reset,
+	.destroy	=	etf_destroy,
+	.dump		=	etf_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init etf_module_init(void)
+{
+	return register_qdisc(&etf_qdisc_ops);
+}
+
+static void __exit etf_module_exit(void)
+{
+	unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 88cab77162e86e0f6a2b7e4f859c1435c4e24feb Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Tue, 3 Jul 2018 15:42:54 -0700
Subject: net/sched: Add HW offloading capability to ETF

Add infra so etf qdisc supports HW offload of time-based transmission.

For hw offload, the time sorted list is still used, so packets are
always dequeued in order of txtime.

Example:

$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
           map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

$ tc qdisc add dev enp2s0 parent 100:1 etf offload delta 100000 \
	   clockid CLOCK_REALTIME

In this example, the Qdisc will use HW offload for the control of the
transmission time through the network adapter. The hrtimer used for
packets scheduling inside the qdisc will use the clockid CLOCK_REALTIME
as reference and packets leave the Qdisc "delta" (100000) nanoseconds
before their transmission time. Because this will be using HW offload and
since dynamic clocks are not supported by the hrtimer, the system clock
and the PHC clock must be synchronized for this mode to behave as
expected.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h        |  5 +++
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_etf.c            | 71 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 76 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 2466ea143d01..7dc769e5452b 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -155,4 +155,9 @@ struct tc_cbs_qopt_offload {
 	s32 sendslope;
 };
 
+struct tc_etf_qopt_offload {
+	u8 enable;
+	s32 queue;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index d5e933ce1447..949118461009 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -944,6 +944,7 @@ struct tc_etf_qopt {
 	__s32 clockid;
 	__u32 flags;
 #define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+#define TC_ETF_OFFLOAD_ON	BIT(1)
 };
 
 enum {
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 4b7f4903ac17..932a136db568 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -20,8 +20,10 @@
 #include <net/sock.h>
 
 #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
 
 struct etf_sched_data {
+	bool offload;
 	bool deadline_mode;
 	int clockid;
 	int queue;
@@ -45,6 +47,9 @@ static inline int validate_input_params(struct tc_etf_qopt *qopt,
 	 *	* Dynamic clockids are not supported.
 	 *
 	 *	* Delta must be a positive integer.
+	 *
+	 * Also note that for the HW offload case, we must
+	 * expect that system clocks have been synchronized to PHC.
 	 */
 	if (qopt->clockid < 0) {
 		NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
@@ -225,6 +230,56 @@ out:
 	return skb;
 }
 
+static void etf_disable_offload(struct net_device *dev,
+				struct etf_sched_data *q)
+{
+	struct tc_etf_qopt_offload etf = { };
+	const struct net_device_ops *ops;
+	int err;
+
+	if (!q->offload)
+		return;
+
+	ops = dev->netdev_ops;
+	if (!ops->ndo_setup_tc)
+		return;
+
+	etf.queue = q->queue;
+	etf.enable = 0;
+
+	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+	if (err < 0)
+		pr_warn("Couldn't disable ETF offload for queue %d\n",
+			etf.queue);
+}
+
+static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
+			      struct netlink_ext_ack *extack)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct tc_etf_qopt_offload etf = { };
+	int err;
+
+	if (q->offload)
+		return 0;
+
+	if (!ops->ndo_setup_tc) {
+		NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
+		return -EOPNOTSUPP;
+	}
+
+	etf.queue = q->queue;
+	etf.enable = 1;
+
+	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload");
+		return err;
+	}
+
+	return 0;
+}
+
 static int etf_init(struct Qdisc *sch, struct nlattr *opt,
 		    struct netlink_ext_ack *extack)
 {
@@ -251,8 +306,9 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
 
 	qopt = nla_data(tb[TCA_ETF_PARMS]);
 
-	pr_debug("delta %d clockid %d deadline %s\n",
+	pr_debug("delta %d clockid %d offload %s deadline %s\n",
 		 qopt->delta, qopt->clockid,
+		 OFFLOAD_IS_ON(qopt) ? "on" : "off",
 		 DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
 
 	err = validate_input_params(qopt, extack);
@@ -261,9 +317,16 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
 
 	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
 
+	if (OFFLOAD_IS_ON(qopt)) {
+		err = etf_enable_offload(dev, q, extack);
+		if (err < 0)
+			return err;
+	}
+
 	/* Everything went OK, save the parameters used. */
 	q->delta = qopt->delta;
 	q->clockid = qopt->clockid;
+	q->offload = OFFLOAD_IS_ON(qopt);
 	q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
 
 	switch (q->clockid) {
@@ -326,10 +389,13 @@ static void etf_reset(struct Qdisc *sch)
 static void etf_destroy(struct Qdisc *sch)
 {
 	struct etf_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
 
 	/* Only cancel watchdog if it's been initialized. */
 	if (q->watchdog.qdisc == sch)
 		qdisc_watchdog_cancel(&q->watchdog);
+
+	etf_disable_offload(dev, q);
 }
 
 static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -344,6 +410,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 	opt.delta = q->delta;
 	opt.clockid = q->clockid;
+	if (q->offload)
+		opt.flags |= TC_ETF_OFFLOAD_ON;
+
 	if (q->deadline_mode)
 		opt.flags |= TC_ETF_DEADLINE_MODE_ON;
 
-- 
cgit v1.2.3


From 4b15c7075352668d4467ced7594b676707d11cae Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Tue, 3 Jul 2018 15:43:00 -0700
Subject: net/sched: Make etf report drops on error_queue

Use the socket error queue for reporting dropped packets if the
socket has enabled that feature through the SO_TXTIME API.

Packets are dropped either on enqueue() if they aren't accepted by the
qdisc or on dequeue() if the system misses their deadline. Those are
reported as different errors so applications can react accordingly.

Userspace can retrieve the errors through the socket error queue and the
corresponding cmsg interfaces. A struct sock_extended_err* is used for
returning the error data, and the packet's timestamp can be retrieved by
adding both ee_data and ee_info fields as e.g.:

    ((__u64) serr->ee_data << 32) + serr->ee_info

This feature is disabled by default and must be explicitly enabled by
applications. Enabling it can bring some overhead for the Tx cycles
of the application.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h              |  3 ++-
 include/uapi/linux/errqueue.h   |  4 ++++
 include/uapi/linux/net_tstamp.h |  5 ++++-
 net/core/sock.c                 |  4 ++++
 net/sched/sch_etf.c             | 35 +++++++++++++++++++++++++++++++++--
 5 files changed, 47 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/sock.h b/include/net/sock.h
index 68347b9821c6..e0eac9ef44b5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -481,7 +481,8 @@ struct sock {
 
 	u8			sk_clockid;
 	u8			sk_txtime_deadline_mode : 1,
-				sk_txtime_unused : 7;
+				sk_txtime_report_errors : 1,
+				sk_txtime_unused : 6;
 
 	struct socket		*sk_socket;
 	void			*sk_user_data;
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index dc64cfaf13da..c0151200f7d1 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -20,12 +20,16 @@ struct sock_extended_err {
 #define SO_EE_ORIGIN_ICMP6	3
 #define SO_EE_ORIGIN_TXSTATUS	4
 #define SO_EE_ORIGIN_ZEROCOPY	5
+#define SO_EE_ORIGIN_TXTIME	6
 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
 
 #define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))
 
 #define SO_EE_CODE_ZEROCOPY_COPIED	1
 
+#define SO_EE_CODE_TXTIME_INVALID_PARAM	1
+#define SO_EE_CODE_TXTIME_MISSED	2
+
 /**
  *	struct scm_timestamping - timestamps exposed through cmsg
  *
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index c9a77c353b98..f8f4539f1135 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -147,8 +147,11 @@ struct scm_ts_pktinfo {
  */
 enum txtime_flags {
 	SOF_TXTIME_DEADLINE_MODE = (1 << 0),
+	SOF_TXTIME_REPORT_ERRORS = (1 << 1),
 
-	SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE)
+	SOF_TXTIME_FLAGS_LAST = SOF_TXTIME_REPORT_ERRORS,
+	SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_FLAGS_LAST - 1) |
+				 SOF_TXTIME_FLAGS_LAST
 };
 
 struct sock_txtime {
diff --git a/net/core/sock.c b/net/core/sock.c
index fe64b839f1b2..03fdea5b0f57 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1087,6 +1087,8 @@ set_rcvbuf:
 			sk->sk_clockid = sk_txtime.clockid;
 			sk->sk_txtime_deadline_mode =
 				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+			sk->sk_txtime_report_errors =
+				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
 		}
 		break;
 
@@ -1429,6 +1431,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.txtime.clockid = sk->sk_clockid;
 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
 				  SOF_TXTIME_DEADLINE_MODE : 0;
+		v.txtime.flags |= sk->sk_txtime_report_errors ?
+				  SOF_TXTIME_REPORT_ERRORS : 0;
 		break;
 
 	default:
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 932a136db568..1538d6fa8165 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
+#include <linux/errqueue.h>
 #include <linux/rbtree.h>
 #include <linux/skbuff.h>
 #include <linux/posix-timers.h>
@@ -123,6 +124,32 @@ static void reset_watchdog(struct Qdisc *sch)
 	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
 }
 
+static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *clone;
+	ktime_t txtime = skb->tstamp;
+
+	if (!skb->sk || !(skb->sk->sk_txtime_report_errors))
+		return;
+
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (!clone)
+		return;
+
+	serr = SKB_EXT_ERR(clone);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME;
+	serr->ee.ee_type = 0;
+	serr->ee.ee_code = code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
+	serr->ee.ee_info = txtime; /* low part of tstamp */
+
+	if (sock_queue_err_skb(skb->sk, clone))
+		kfree_skb(clone);
+}
+
 static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 				      struct sk_buff **to_free)
 {
@@ -130,8 +157,11 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 	struct rb_node **p = &q->head.rb_node, *parent = NULL;
 	ktime_t txtime = nskb->tstamp;
 
-	if (!is_packet_valid(sch, nskb))
+	if (!is_packet_valid(sch, nskb)) {
+		report_sock_error(nskb, EINVAL,
+				  SO_EE_CODE_TXTIME_INVALID_PARAM);
 		return qdisc_drop(nskb, sch, to_free);
+	}
 
 	while (*p) {
 		struct sk_buff *skb;
@@ -174,6 +204,8 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
 	if (drop) {
 		struct sk_buff *to_free = NULL;
 
+		report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
 		qdisc_drop(skb, sch, &to_free);
 		kfree_skb_list(to_free);
 		qdisc_qstats_overlimit(sch);
@@ -199,7 +231,6 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
 	now = q->get_time();
 
 	/* Drop if packet has expired while in queue. */
-	/* FIXME: Must return error on the socket's error queue */
 	if (ktime_before(skb->tstamp, now)) {
 		timesortedlist_erase(sch, skb, true);
 		skb = NULL;
-- 
cgit v1.2.3


From 2a75364d09b05f257f4cd1f718e06e0247eb1dd3 Mon Sep 17 00:00:00 2001
From: Keiichi Watanabe <keiichiw@chromium.org>
Date: Mon, 18 Jun 2018 03:58:53 -0400
Subject: media: v4l2-ctrl: Add control for VP9 profile

Add a new control V4L2_CID_MPEG_VIDEO_VP9_PROFILE for VP9 profiles. This control
allows selecting the desired profile for VP9 encoder and querying for supported
profiles by VP9 encoder/decoder.

Though this control is similar to V4L2_CID_MPEG_VIDEO_VP8_PROFILE, we need to
separate this control from it because supported profiles usually differ between
VP8 and VP9.

Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/extended-controls.rst | 23 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ctrls.c               | 11 +++++++++++
 include/uapi/linux/v4l2-controls.h                 |  7 +++++++
 3 files changed, 41 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index 01ef31a934b4..9f7312bf3365 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -1978,6 +1978,29 @@ enum v4l2_mpeg_video_vp8_profile -
     * - ``V4L2_MPEG_VIDEO_VP8_PROFILE_3``
       - Profile 3
 
+.. _v4l2-mpeg-video-vp9-profile:
+
+``V4L2_CID_MPEG_VIDEO_VP9_PROFILE``
+    (enum)
+
+enum v4l2_mpeg_video_vp9_profile -
+    This control allows selecting the profile for VP9 encoder.
+    This is also used to enumerate supported profiles by VP9 encoder or decoder.
+    Possible values are:
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - ``V4L2_MPEG_VIDEO_VP9_PROFILE_0``
+      - Profile 0
+    * - ``V4L2_MPEG_VIDEO_VP9_PROFILE_1``
+      - Profile 1
+    * - ``V4L2_MPEG_VIDEO_VP9_PROFILE_2``
+      - Profile 2
+    * - ``V4L2_MPEG_VIDEO_VP9_PROFILE_3``
+      - Profile 3
+
 
 High Efficiency Video Coding (HEVC/H.265) Control Reference
 -----------------------------------------------------------
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 4a182a70de36..599c1cbff3b9 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -438,6 +438,13 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 		"3",
 		NULL,
 	};
+	static const char * const vp9_profile[] = {
+		"0",
+		"1",
+		"2",
+		"3",
+		NULL,
+	};
 
 	static const char * const flash_led_mode[] = {
 		"Off",
@@ -623,6 +630,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 		return vpx_golden_frame_sel;
 	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
 		return vp8_profile;
+	case V4L2_CID_MPEG_VIDEO_VP9_PROFILE:
+		return vp9_profile;
 	case V4L2_CID_JPEG_CHROMA_SUBSAMPLING:
 		return jpeg_chroma_subsampling;
 	case V4L2_CID_DV_TX_MODE:
@@ -849,6 +858,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_VPX_I_FRAME_QP:		return "VPX I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP:		return "VPX P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:			return "VP8 Profile";
+	case V4L2_CID_MPEG_VIDEO_VP9_PROFILE:			return "VP9 Profile";
 
 	/* HEVC controls */
 	case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP:		return "HEVC I-Frame QP Value";
@@ -1190,6 +1200,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_TUNE_DEEMPHASIS:
 	case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL:
 	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:
+	case V4L2_CID_MPEG_VIDEO_VP9_PROFILE:
 	case V4L2_CID_DETECT_MD_MODE:
 	case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE:
 	case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index ab96795b2829..e4ee10ee917d 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -597,6 +597,13 @@ enum v4l2_mpeg_video_vp8_profile {
 };
 /* Deprecated alias for compatibility reasons. */
 #define V4L2_CID_MPEG_VIDEO_VPX_PROFILE	V4L2_CID_MPEG_VIDEO_VP8_PROFILE
+#define V4L2_CID_MPEG_VIDEO_VP9_PROFILE			(V4L2_CID_MPEG_BASE+512)
+enum v4l2_mpeg_video_vp9_profile {
+	V4L2_MPEG_VIDEO_VP9_PROFILE_0				= 0,
+	V4L2_MPEG_VIDEO_VP9_PROFILE_1				= 1,
+	V4L2_MPEG_VIDEO_VP9_PROFILE_2				= 2,
+	V4L2_MPEG_VIDEO_VP9_PROFILE_3				= 3,
+};
 
 /* CIDs for HEVC encoding. */
 
-- 
cgit v1.2.3


From eabaef1896bc06319461a644e3aa139885454def Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 4 Jul 2018 14:30:28 +0300
Subject: devlink: Add devlink_param register and unregister

Define configuration parameters data structure.
Add functions to register and unregister the driver supported
configuration parameters table.
For each parameter registered, the driver should fill all the parameter's
fields. If the only supported configuration mode is "driverinit", the
parameter's get()/set() functions are not required and should be set to
NULL. For any other configuration mode, these functions are required and
must be set by the driver.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  85 +++++++++++++++++++++++++
 include/uapi/linux/devlink.h |  10 +++
 net/core/devlink.c           | 148 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index e336ea9c73df..4a0687a1fb99 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -27,6 +27,7 @@ struct devlink {
 	struct list_head sb_list;
 	struct list_head dpipe_table_list;
 	struct list_head resource_list;
+	struct list_head param_list;
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
@@ -295,6 +296,68 @@ struct devlink_resource {
 
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
+#define DEVLINK_PARAM_MAX_STRING_VALUE 32
+enum devlink_param_type {
+	DEVLINK_PARAM_TYPE_U8,
+	DEVLINK_PARAM_TYPE_U16,
+	DEVLINK_PARAM_TYPE_U32,
+	DEVLINK_PARAM_TYPE_STRING,
+	DEVLINK_PARAM_TYPE_BOOL,
+};
+
+union devlink_param_value {
+	u8 vu8;
+	u16 vu16;
+	u32 vu32;
+	const char *vstr;
+	bool vbool;
+};
+
+struct devlink_param_gset_ctx {
+	union devlink_param_value val;
+	enum devlink_param_cmode cmode;
+};
+
+/**
+ * struct devlink_param - devlink configuration parameter data
+ * @name: name of the parameter
+ * @generic: indicates if the parameter is generic or driver specific
+ * @type: parameter type
+ * @supported_cmodes: bitmap of supported configuration modes
+ * @get: get parameter value, used for runtime and permanent
+ *       configuration modes
+ * @set: set parameter value, used for runtime and permanent
+ *       configuration modes
+ *
+ * This struct should be used by the driver to fill the data for
+ * a parameter it registers.
+ */
+struct devlink_param {
+	u32 id;
+	const char *name;
+	bool generic;
+	enum devlink_param_type type;
+	unsigned long supported_cmodes;
+	int (*get)(struct devlink *devlink, u32 id,
+		   struct devlink_param_gset_ctx *ctx);
+	int (*set)(struct devlink *devlink, u32 id,
+		   struct devlink_param_gset_ctx *ctx);
+};
+
+struct devlink_param_item {
+	struct list_head list;
+	const struct devlink_param *param;
+	union devlink_param_value driverinit_value;
+	bool driverinit_value_valid;
+};
+
+enum devlink_param_generic_id {
+
+	/* add new param generic ids above here */
+	__DEVLINK_PARAM_GENERIC_ID_MAX,
+	DEVLINK_PARAM_GENERIC_ID_MAX = __DEVLINK_PARAM_GENERIC_ID_MAX - 1,
+};
+
 struct devlink_ops {
 	int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
@@ -430,6 +493,12 @@ void devlink_resource_occ_get_register(struct devlink *devlink,
 				       void *occ_get_priv);
 void devlink_resource_occ_get_unregister(struct devlink *devlink,
 					 u64 resource_id);
+int devlink_params_register(struct devlink *devlink,
+			    const struct devlink_param *params,
+			    size_t params_count);
+void devlink_params_unregister(struct devlink *devlink,
+			       const struct devlink_param *params,
+			       size_t params_count);
 
 #else
 
@@ -622,6 +691,22 @@ devlink_resource_occ_get_unregister(struct devlink *devlink,
 {
 }
 
+static inline int
+devlink_params_register(struct devlink *devlink,
+			const struct devlink_param *params,
+			size_t params_count)
+{
+	return 0;
+}
+
+static inline void
+devlink_params_unregister(struct devlink *devlink,
+			  const struct devlink_param *params,
+			  size_t params_count)
+{
+
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 75cb5450c851..d814fa67c7b9 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -142,6 +142,16 @@ enum devlink_port_flavour {
 				   */
 };
 
+enum devlink_param_cmode {
+	DEVLINK_PARAM_CMODE_RUNTIME,
+	DEVLINK_PARAM_CMODE_DRIVERINIT,
+	DEVLINK_PARAM_CMODE_PERMANENT,
+
+	/* Add new configuration modes above */
+	__DEVLINK_PARAM_CMODE_MAX,
+	DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 22099705cc41..41b1a5d1c992 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2604,6 +2604,82 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 	return devlink->ops->reload(devlink, info->extack);
 }
 
+static const struct devlink_param devlink_param_generic[] = {};
+
+static int devlink_param_generic_verify(const struct devlink_param *param)
+{
+	/* verify it matches the generic parameter by id and name */
+	if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
+		return -EINVAL;
+	if (strcmp(param->name, devlink_param_generic[param->id].name))
+		return -ENOENT;
+
+	WARN_ON(param->type != devlink_param_generic[param->id].type);
+
+	return 0;
+}
+
+static int devlink_param_driver_verify(const struct devlink_param *param)
+{
+	int i;
+
+	if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
+		return -EINVAL;
+	/* verify no such name in generic params */
+	for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
+		if (!strcmp(param->name, devlink_param_generic[i].name))
+			return -EEXIST;
+
+	return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_name(struct list_head *param_list,
+			   const char *param_name)
+{
+	struct devlink_param_item *param_item;
+
+	list_for_each_entry(param_item, param_list, list)
+		if (!strcmp(param_item->param->name, param_name))
+			return param_item;
+	return NULL;
+}
+
+static int devlink_param_register_one(struct devlink *devlink,
+				      const struct devlink_param *param)
+{
+	struct devlink_param_item *param_item;
+
+	if (devlink_param_find_by_name(&devlink->param_list,
+				       param->name))
+		return -EEXIST;
+
+	if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
+		WARN_ON(param->get || param->set);
+	else
+		WARN_ON(!param->get || !param->set);
+
+	param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
+	if (!param_item)
+		return -ENOMEM;
+	param_item->param = param;
+
+	list_add_tail(&param_item->list, &devlink->param_list);
+	return 0;
+}
+
+static void devlink_param_unregister_one(struct devlink *devlink,
+					 const struct devlink_param *param)
+{
+	struct devlink_param_item *param_item;
+
+	param_item = devlink_param_find_by_name(&devlink->param_list,
+						param->name);
+	WARN_ON(!param_item);
+	list_del(&param_item->list);
+	kfree(param_item);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2845,6 +2921,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
 	INIT_LIST_HEAD(&devlink->resource_list);
+	INIT_LIST_HEAD(&devlink->param_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
@@ -3434,6 +3511,77 @@ out:
 }
 EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
 
+/**
+ *	devlink_params_register - register configuration parameters
+ *
+ *	@devlink: devlink
+ *	@params: configuration parameters array
+ *	@params_count: number of parameters provided
+ *
+ *	Register the configuration parameters supported by the driver.
+ */
+int devlink_params_register(struct devlink *devlink,
+			    const struct devlink_param *params,
+			    size_t params_count)
+{
+	const struct devlink_param *param = params;
+	int i;
+	int err;
+
+	mutex_lock(&devlink->lock);
+	for (i = 0; i < params_count; i++, param++) {
+		if (!param || !param->name || !param->supported_cmodes) {
+			err = -EINVAL;
+			goto rollback;
+		}
+		if (param->generic) {
+			err = devlink_param_generic_verify(param);
+			if (err)
+				goto rollback;
+		} else {
+			err = devlink_param_driver_verify(param);
+			if (err)
+				goto rollback;
+		}
+		err = devlink_param_register_one(devlink, param);
+		if (err)
+			goto rollback;
+	}
+
+	mutex_unlock(&devlink->lock);
+	return 0;
+
+rollback:
+	if (!i)
+		goto unlock;
+	for (param--; i > 0; i--, param--)
+		devlink_param_unregister_one(devlink, param);
+unlock:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_params_register);
+
+/**
+ *	devlink_params_unregister - unregister configuration parameters
+ *	@devlink: devlink
+ *	@params: configuration parameters to unregister
+ *	@params_count: number of parameters provided
+ */
+void devlink_params_unregister(struct devlink *devlink,
+			       const struct devlink_param *params,
+			       size_t params_count)
+{
+	const struct devlink_param *param = params;
+	int i;
+
+	mutex_lock(&devlink->lock);
+	for (i = 0; i < params_count; i++, param++)
+		devlink_param_unregister_one(devlink, param);
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit v1.2.3


From 45f05def5c44c806f094709f1c9b03dcecdd54f0 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 4 Jul 2018 14:30:29 +0300
Subject: devlink: Add param get command

Add a param get command which gets the data of a single parameter.
The command also supports dump, which returns the parameters' data per device.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  11 ++
 net/core/devlink.c           | 250 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 261 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index d814fa67c7b9..2ccfe84176bf 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -78,6 +78,8 @@ enum devlink_command {
 	 */
 	DEVLINK_CMD_RELOAD,
 
+	DEVLINK_CMD_PARAM_GET,		/* can dump */
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -248,6 +250,15 @@ enum devlink_attr {
 	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
 	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
 
+	DEVLINK_ATTR_PARAM,			/* nested */
+	DEVLINK_ATTR_PARAM_NAME,		/* string */
+	DEVLINK_ATTR_PARAM_GENERIC,		/* flag */
+	DEVLINK_ATTR_PARAM_TYPE,		/* u8 */
+	DEVLINK_ATTR_PARAM_VALUES_LIST,		/* nested */
+	DEVLINK_ATTR_PARAM_VALUE,		/* nested */
+	DEVLINK_ATTR_PARAM_VALUE_DATA,		/* dynamic */
+	DEVLINK_ATTR_PARAM_VALUE_CMODE,		/* u8 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 41b1a5d1c992..b22d41275f0b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2645,6 +2645,248 @@ devlink_param_find_by_name(struct list_head *param_list,
 	return NULL;
 }
 
+static bool
+devlink_param_cmode_is_supported(const struct devlink_param *param,
+				 enum devlink_param_cmode cmode)
+{
+	return test_bit(cmode, &param->supported_cmodes);
+}
+
+static int devlink_param_get(struct devlink *devlink,
+			     const struct devlink_param *param,
+			     struct devlink_param_gset_ctx *ctx)
+{
+	if (!param->get)
+		return -EOPNOTSUPP;
+	return param->get(devlink, param->id, ctx);
+}
+
+static int
+devlink_param_type_to_nla_type(enum devlink_param_type param_type)
+{
+	switch (param_type) {
+	case DEVLINK_PARAM_TYPE_U8:
+		return NLA_U8;
+	case DEVLINK_PARAM_TYPE_U16:
+		return NLA_U16;
+	case DEVLINK_PARAM_TYPE_U32:
+		return NLA_U32;
+	case DEVLINK_PARAM_TYPE_STRING:
+		return NLA_STRING;
+	case DEVLINK_PARAM_TYPE_BOOL:
+		return NLA_FLAG;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int
+devlink_nl_param_value_fill_one(struct sk_buff *msg,
+				enum devlink_param_type type,
+				enum devlink_param_cmode cmode,
+				union devlink_param_value val)
+{
+	struct nlattr *param_value_attr;
+
+	param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE);
+	if (!param_value_attr)
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
+		goto value_nest_cancel;
+
+	switch (type) {
+	case DEVLINK_PARAM_TYPE_U8:
+		if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
+			goto value_nest_cancel;
+		break;
+	case DEVLINK_PARAM_TYPE_U16:
+		if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
+			goto value_nest_cancel;
+		break;
+	case DEVLINK_PARAM_TYPE_U32:
+		if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
+			goto value_nest_cancel;
+		break;
+	case DEVLINK_PARAM_TYPE_STRING:
+		if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
+				   val.vstr))
+			goto value_nest_cancel;
+		break;
+	case DEVLINK_PARAM_TYPE_BOOL:
+		if (val.vbool &&
+		    nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
+			goto value_nest_cancel;
+		break;
+	}
+
+	nla_nest_end(msg, param_value_attr);
+	return 0;
+
+value_nest_cancel:
+	nla_nest_cancel(msg, param_value_attr);
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+				 struct devlink_param_item *param_item,
+				 enum devlink_command cmd,
+				 u32 portid, u32 seq, int flags)
+{
+	union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+	const struct devlink_param *param = param_item->param;
+	struct devlink_param_gset_ctx ctx;
+	struct nlattr *param_values_list;
+	struct nlattr *param_attr;
+	int nla_type;
+	void *hdr;
+	int err;
+	int i;
+
+	/* Get each supported cmode's value: cached for driverinit, else from driver */
+	for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+		if (!devlink_param_cmode_is_supported(param, i))
+			continue;
+		if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+			if (!param_item->driverinit_value_valid)
+				return -EOPNOTSUPP;
+			param_value[i] = param_item->driverinit_value;
+		} else {
+			ctx.cmode = i;
+			err = devlink_param_get(devlink, param, &ctx);
+			if (err)
+				return err;
+			param_value[i] = ctx.val;
+		}
+	}
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto genlmsg_cancel;
+	param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
+	if (!param_attr)
+		goto genlmsg_cancel;
+	if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
+		goto param_nest_cancel;
+	if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
+		goto param_nest_cancel;
+
+	nla_type = devlink_param_type_to_nla_type(param->type);
+	if (nla_type < 0)
+		goto param_nest_cancel;
+	if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
+		goto param_nest_cancel;
+
+	param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST);
+	if (!param_values_list)
+		goto param_nest_cancel;
+
+	for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+		if (!devlink_param_cmode_is_supported(param, i))
+			continue;
+		err = devlink_nl_param_value_fill_one(msg, param->type,
+						      i, param_value[i]);
+		if (err)
+			goto values_list_nest_cancel;
+	}
+
+	nla_nest_end(msg, param_values_list);
+	nla_nest_end(msg, param_attr);
+	genlmsg_end(msg, hdr);
+	return 0;
+
+values_list_nest_cancel:
+	nla_nest_end(msg, param_values_list);
+param_nest_cancel:
+	nla_nest_cancel(msg, param_attr);
+genlmsg_cancel:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
+					   struct netlink_callback *cb)
+{
+	struct devlink_param_item *param_item;
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(param_item, &devlink->param_list, list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_param_fill(msg, devlink, param_item,
+						    DEVLINK_CMD_PARAM_GET,
+						    NETLINK_CB(cb->skb).portid,
+						    cb->nlh->nlmsg_seq,
+						    NLM_F_MULTI);
+			if (err) {
+				mutex_unlock(&devlink->lock);
+				goto out;
+			}
+			idx++;
+		}
+		mutex_unlock(&devlink->lock);
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static struct devlink_param_item *
+devlink_param_get_from_info(struct devlink *devlink,
+			    struct genl_info *info)
+{
+	char *param_name;
+
+	if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
+		return NULL;
+
+	param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
+	return devlink_param_find_by_name(&devlink->param_list, param_name);
+}
+
+static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
+					 struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_param_item *param_item;
+	struct sk_buff *msg;
+	int err;
+
+	param_item = devlink_param_get_from_info(devlink, info);
+	if (!param_item)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_param_fill(msg, devlink, param_item,
+				    DEVLINK_CMD_PARAM_GET,
+				    info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
 static int devlink_param_register_one(struct devlink *devlink,
 				      const struct devlink_param *param)
 {
@@ -2883,6 +3125,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
 				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
+	{
+		.cmd = DEVLINK_CMD_PARAM_GET,
+		.doit = devlink_nl_cmd_param_get_doit,
+		.dumpit = devlink_nl_cmd_param_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3


From e3b7ca18ad7b2f47ebd3b6e6ce58a42c6ec24746 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 4 Jul 2018 14:30:30 +0300
Subject: devlink: Add param set command

Add param set command to set value for a parameter.
Value can be set to any of the supported configuration modes.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |   4 ++
 include/uapi/linux/devlink.h |   1 +
 net/core/devlink.c           | 134 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4a0687a1fb99..88062752dcd7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -328,6 +328,7 @@ struct devlink_param_gset_ctx {
  *       configuration modes
  * @set: set parameter value, used for runtime and permanent
  *       configuration modes
+ * @validate: validate input value is applicable (within value range, etc.)
  *
  * This struct should be used by the driver to fill the data for
  * a parameter it registers.
@@ -342,6 +343,9 @@ struct devlink_param {
 		   struct devlink_param_gset_ctx *ctx);
 	int (*set)(struct devlink *devlink, u32 id,
 		   struct devlink_param_gset_ctx *ctx);
+	int (*validate)(struct devlink *devlink, u32 id,
+			union devlink_param_value val,
+			struct netlink_ext_ack *extack);
 };
 
 struct devlink_param_item {
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 2ccfe84176bf..ea0623e568f0 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -79,6 +79,7 @@ enum devlink_command {
 	DEVLINK_CMD_RELOAD,
 
 	DEVLINK_CMD_PARAM_GET,		/* can dump */
+	DEVLINK_CMD_PARAM_SET,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b22d41275f0b..0cd7a42dcec2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2661,6 +2661,15 @@ static int devlink_param_get(struct devlink *devlink,
 	return param->get(devlink, param->id, ctx);
 }
 
+static int devlink_param_set(struct devlink *devlink,
+			     const struct devlink_param *param,
+			     struct devlink_param_gset_ctx *ctx)
+{
+	if (!param->set)
+		return -EOPNOTSUPP;
+	return param->set(devlink, param->id, ctx);
+}
+
 static int
 devlink_param_type_to_nla_type(enum devlink_param_type param_type)
 {
@@ -2847,6 +2856,69 @@ out:
 	return msg->len;
 }
 
+static int
+devlink_param_type_get_from_info(struct genl_info *info,
+				 enum devlink_param_type *param_type)
+{
+	if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
+		return -EINVAL;
+
+	switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
+	case NLA_U8:
+		*param_type = DEVLINK_PARAM_TYPE_U8;
+		break;
+	case NLA_U16:
+		*param_type = DEVLINK_PARAM_TYPE_U16;
+		break;
+	case NLA_U32:
+		*param_type = DEVLINK_PARAM_TYPE_U32;
+		break;
+	case NLA_STRING:
+		*param_type = DEVLINK_PARAM_TYPE_STRING;
+		break;
+	case NLA_FLAG:
+		*param_type = DEVLINK_PARAM_TYPE_BOOL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+devlink_param_value_get_from_info(const struct devlink_param *param,
+				  struct genl_info *info,
+				  union devlink_param_value *value)
+{
+	if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
+	    !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
+		return -EINVAL;
+
+	switch (param->type) {
+	case DEVLINK_PARAM_TYPE_U8:
+		value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+		break;
+	case DEVLINK_PARAM_TYPE_U16:
+		value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+		break;
+	case DEVLINK_PARAM_TYPE_U32:
+		value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+		break;
+	case DEVLINK_PARAM_TYPE_STRING:
+		if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) >
+		    DEVLINK_PARAM_MAX_STRING_VALUE)
+			return -EINVAL;
+		value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+		break;
+	case DEVLINK_PARAM_TYPE_BOOL:
+		value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
+			       true : false;
+		break;
+	}
+	return 0;
+}
+
 static struct devlink_param_item *
 devlink_param_get_from_info(struct devlink *devlink,
 			    struct genl_info *info)
@@ -2887,6 +2959,58 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 }
 
+static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
+					 struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	enum devlink_param_type param_type;
+	struct devlink_param_gset_ctx ctx;
+	enum devlink_param_cmode cmode;
+	struct devlink_param_item *param_item;
+	const struct devlink_param *param;
+	union devlink_param_value value;
+	int err = 0;
+
+	param_item = devlink_param_get_from_info(devlink, info);
+	if (!param_item)
+		return -EINVAL;
+	param = param_item->param;
+	err = devlink_param_type_get_from_info(info, &param_type);
+	if (err)
+		return err;
+	if (param_type != param->type)
+		return -EINVAL;
+	err = devlink_param_value_get_from_info(param, info, &value);
+	if (err)
+		return err;
+	if (param->validate) {
+		err = param->validate(devlink, param->id, value, info->extack);
+		if (err)
+			return err;
+	}
+
+	if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
+		return -EINVAL;
+	cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
+	if (!devlink_param_cmode_is_supported(param, cmode))
+		return -EOPNOTSUPP;
+
+	if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+		param_item->driverinit_value = value;
+		param_item->driverinit_value_valid = true;
+	} else {
+		if (!param->set)
+			return -EOPNOTSUPP;
+		ctx.val = value;
+		ctx.cmode = cmode;
+		err = devlink_param_set(devlink, param, &ctx);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int devlink_param_register_one(struct devlink *devlink,
 				      const struct devlink_param *param)
 {
@@ -2942,6 +3066,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
 	[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
+	[DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -3133,6 +3260,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 		/* can be retrieved by unprivileged users */
 	},
+	{
+		.cmd = DEVLINK_CMD_PARAM_SET,
+		.doit = devlink_nl_cmd_param_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3


From ea601e17098856ee059f35c2a75659e57df81f25 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 4 Jul 2018 14:30:32 +0300
Subject: devlink: Add devlink notifications support for params

Add devlink_param_notify() function to support devlink param notifications.
Add notification call to devlink param set, register and unregister
functions.
Add devlink_param_value_changed() function to enable the driver to notify
devlink on value change. Driver should use this function after the value was
changed on any configuration mode other than driverinit.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        |  7 +++++++
 include/uapi/linux/devlink.h |  2 ++
 net/core/devlink.c           | 50 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 3302e43b09a4..792edaa996ba 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -507,6 +507,7 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
 				       union devlink_param_value *init_val);
 int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 				       union devlink_param_value init_val);
+void devlink_param_value_changed(struct devlink *devlink, u32 param_id);
 
 #else
 
@@ -729,6 +730,12 @@ devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 	return -EOPNOTSUPP;
 }
 
+static inline void
+devlink_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+	/* No-op stub for the #else branch; a void function must not return a value. */
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index ea0623e568f0..68641fb56654 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -80,6 +80,8 @@ enum devlink_command {
 
 	DEVLINK_CMD_PARAM_GET,		/* can dump */
 	DEVLINK_CMD_PARAM_SET,
+	DEVLINK_CMD_PARAM_NEW,
+	DEVLINK_CMD_PARAM_DEL,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3af08f4562b5..89d948fd4727 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2828,6 +2828,28 @@ genlmsg_cancel:
 	return -EMSGSIZE;
 }
 
+static void devlink_param_notify(struct devlink *devlink,
+				 struct devlink_param_item *param_item,
+				 enum devlink_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+	err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
 static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
 					   struct netlink_callback *cb)
 {
@@ -3019,6 +3041,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
 			return err;
 	}
 
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
 	return 0;
 }
 
@@ -3042,6 +3065,7 @@ static int devlink_param_register_one(struct devlink *devlink,
 	param_item->param = param;
 
 	list_add_tail(&param_item->list, &devlink->param_list);
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
 	return 0;
 }
 
@@ -3053,6 +3077,7 @@ static void devlink_param_unregister_one(struct devlink *devlink,
 	param_item = devlink_param_find_by_name(&devlink->param_list,
 						param->name);
 	WARN_ON(!param_item);
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
 	list_del(&param_item->list);
 	kfree(param_item);
 }
@@ -4039,10 +4064,35 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 	param_item->driverinit_value = init_val;
 	param_item->driverinit_value_valid = true;
 
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
 
+/**
+ *	devlink_param_value_changed - notify devlink on a parameter's value
+ *				      change. Should be called by the driver
+ *				      right after the change.
+ *
+ *	@devlink: devlink
+ *	@param_id: parameter ID
+ *
+ *	This function should be used by the driver to notify devlink on value
+ *	change, excluding driverinit configuration mode.
+ *	For driverinit configuration mode driver should use the function
+ *	devlink_param_driverinit_value_set() instead.
+ */
+void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+	struct devlink_param_item *param_item;
+
+	param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+	if (WARN_ON(!param_item))
+		return; /* unknown id: do not hand a NULL item to the notifier */
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_changed);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
cgit v1.2.3


From d64efd0926ba4f32e657e615a4f4a6170d5cc0fa Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@mellanox.com>
Date: Fri, 6 Jul 2018 05:38:16 +0000
Subject: net/sched: flower: Add support for matching on QinQ vlan headers

Now that dissecting of QinQ inner and outer vlan headers is supported, users
can add rules to match on QinQ vlan headers.

Signed-off-by: Jianbo Liu <jianbol@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  4 +++
 net/sched/cls_flower.c       | 65 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 55 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 84e4c1d0f874..c4262d911596 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -469,6 +469,10 @@ enum {
 	TCA_FLOWER_KEY_IP_TTL,		/* u8 */
 	TCA_FLOWER_KEY_IP_TTL_MASK,	/* u8 */
 
+	TCA_FLOWER_KEY_CVLAN_ID,	/* be16 */
+	TCA_FLOWER_KEY_CVLAN_PRIO,	/* u8   */
+	TCA_FLOWER_KEY_CVLAN_ETH_TYPE,	/* be16 */
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e93b13d2cb81..487a152a852c 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -35,6 +35,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_eth_addrs eth;
 	struct flow_dissector_key_vlan vlan;
+	struct flow_dissector_key_vlan cvlan;
 	union {
 		struct flow_dissector_key_ipv4_addrs ipv4;
 		struct flow_dissector_key_ipv6_addrs ipv6;
@@ -449,6 +450,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_IP_TOS_MASK]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_IP_TTL]		= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_IP_TTL_MASK]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_CVLAN_ID]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CVLAN_PRIO]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_CVLAN_ETH_TYPE]	= { .type = NLA_U16 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -501,19 +505,20 @@ static int fl_set_key_mpls(struct nlattr **tb,
 
 static void fl_set_key_vlan(struct nlattr **tb,
 			    __be16 ethertype,
+			    int vlan_id_key, int vlan_prio_key,
 			    struct flow_dissector_key_vlan *key_val,
 			    struct flow_dissector_key_vlan *key_mask)
 {
 #define VLAN_PRIORITY_MASK	0x7
 
-	if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+	if (tb[vlan_id_key]) {
 		key_val->vlan_id =
-			nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+			nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK;
 		key_mask->vlan_id = VLAN_VID_MASK;
 	}
-	if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+	if (tb[vlan_prio_key]) {
 		key_val->vlan_priority =
-			nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
+			nla_get_u8(tb[vlan_prio_key]) &
 			VLAN_PRIORITY_MASK;
 		key_mask->vlan_priority = VLAN_PRIORITY_MASK;
 	}
@@ -596,11 +601,25 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
 
 		if (eth_type_vlan(ethertype)) {
-			fl_set_key_vlan(tb, ethertype, &key->vlan, &mask->vlan);
-			fl_set_key_val(tb, &key->basic.n_proto,
-				       TCA_FLOWER_KEY_VLAN_ETH_TYPE,
-				       &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
-				       sizeof(key->basic.n_proto));
+			fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
+					TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan,
+					&mask->vlan);
+
+			ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
+			if (eth_type_vlan(ethertype)) {
+				fl_set_key_vlan(tb, ethertype,
+						TCA_FLOWER_KEY_CVLAN_ID,
+						TCA_FLOWER_KEY_CVLAN_PRIO,
+						&key->cvlan, &mask->cvlan);
+				fl_set_key_val(tb, &key->basic.n_proto,
+					       TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+					       &mask->basic.n_proto,
+					       TCA_FLOWER_UNSPEC,
+					       sizeof(key->basic.n_proto));
+			} else {
+				key->basic.n_proto = ethertype;
+				mask->basic.n_proto = cpu_to_be16(~0);
+			}
 		} else {
 			key->basic.n_proto = ethertype;
 			mask->basic.n_proto = cpu_to_be16(~0);
@@ -825,6 +844,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask)
 			     FLOW_DISSECTOR_KEY_MPLS, mpls);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_VLAN, vlan);
+	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+			     FLOW_DISSECTOR_KEY_CVLAN, cvlan);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
@@ -1201,6 +1222,7 @@ static int fl_dump_key_ip(struct sk_buff *skb,
 }
 
 static int fl_dump_key_vlan(struct sk_buff *skb,
+			    int vlan_id_key, int vlan_prio_key,
 			    struct flow_dissector_key_vlan *vlan_key,
 			    struct flow_dissector_key_vlan *vlan_mask)
 {
@@ -1209,13 +1231,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
 	if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
 		return 0;
 	if (vlan_mask->vlan_id) {
-		err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID,
+		err = nla_put_u16(skb, vlan_id_key,
 				  vlan_key->vlan_id);
 		if (err)
 			return err;
 	}
 	if (vlan_mask->vlan_priority) {
-		err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO,
+		err = nla_put_u8(skb, vlan_prio_key,
 				 vlan_key->vlan_priority);
 		if (err)
 			return err;
@@ -1310,13 +1332,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
 	if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls))
 		goto nla_put_failure;
 
-	if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan))
+	if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID,
+			     TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan))
 		goto nla_put_failure;
 
-	if (mask->vlan.vlan_tpid &&
-	    nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, key->basic.n_proto))
+	if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID,
+			     TCA_FLOWER_KEY_CVLAN_PRIO,
+			     &key->cvlan, &mask->cvlan) ||
+	    (mask->cvlan.vlan_tpid &&
+	     nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+			 key->cvlan.vlan_tpid)))
 		goto nla_put_failure;
 
+	if (mask->cvlan.vlan_tpid) {
+		if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+				 key->basic.n_proto))
+			goto nla_put_failure;
+	} else if (mask->vlan.vlan_tpid) {
+		if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+				 key->basic.n_proto))
+			goto nla_put_failure;
+	}
+
 	if ((key->basic.n_proto == htons(ETH_P_IP) ||
 	     key->basic.n_proto == htons(ETH_P_IPV6)) &&
 	    (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
-- 
cgit v1.2.3


From 16777ecd1b54d75136f77b2cc25f2cfa75156852 Mon Sep 17 00:00:00 2001
From: Samuel Thibault <samuel.thibault@ens-lyon.org>
Date: Sun, 3 Jun 2018 20:18:58 +0200
Subject: kbd: complete dead keys definitions

This completes dead keys definitions for internationalization
completeness on the console.  The representatives have been chosen
coherently with libx11 compose sequences, which avoid symmetry conflicts
(e.g. there is U with caron, but no c with breve).

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/s390/char/keyboard.c  | 30 ++++++++++++++++++++++++++++--
 drivers/tty/vt/keyboard.c     | 30 +++++++++++++++++++++++++++++-
 include/uapi/linux/keyboard.h | 23 ++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c
index 79eb60958015..eda245887fe0 100644
--- a/drivers/s390/char/keyboard.c
+++ b/drivers/s390/char/keyboard.c
@@ -39,8 +39,34 @@ static const int kbd_max_vals[] = {
 };
 static const int KBD_NR_TYPES = ARRAY_SIZE(kbd_max_vals);
 
-static unsigned char ret_diacr[NR_DEAD] = {
-	'`', '\'', '^', '~', '"', ','
+static const unsigned char ret_diacr[NR_DEAD] = {
+	'`',	/* dead_grave */
+	'\'',	/* dead_acute */
+	'^',	/* dead_circumflex */
+	'~',	/* dead_tilde */
+	'"',	/* dead_diaeresis */
+	',',	/* dead_cedilla */
+	'_',	/* dead_macron */
+	'U',	/* dead_breve */
+	'.',	/* dead_abovedot */
+	'*',	/* dead_abovering */
+	'=',	/* dead_doubleacute */
+	'c',	/* dead_caron */
+	'k',	/* dead_ogonek */
+	'i',	/* dead_iota */
+	'#',	/* dead_voiced_sound */
+	'o',	/* dead_semivoiced_sound */
+	'!',	/* dead_belowdot */
+	'?',	/* dead_hook */
+	'+',	/* dead_horn */
+	'-',	/* dead_stroke */
+	')',	/* dead_abovecomma */
+	'(',	/* dead_abovereversedcomma */
+	':',	/* dead_doublegrave */
+	'n',	/* dead_invertedbreve */
+	';',	/* dead_belowcomma */
+	'$',	/* dead_currency */
+	'@',	/* dead_greek */
 };
 
 /*
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index d5b4a2b44ab8..c0f5802acd7c 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -690,7 +690,35 @@ static void k_dead2(struct vc_data *vc, unsigned char value, char up_flag)
  */
 static void k_dead(struct vc_data *vc, unsigned char value, char up_flag)
 {
-	static const unsigned char ret_diacr[NR_DEAD] = {'`', '\'', '^', '~', '"', ',' };
+	static const unsigned char ret_diacr[NR_DEAD] = {
+		'`',	/* dead_grave */
+		'\'',	/* dead_acute */
+		'^',	/* dead_circumflex */
+		'~',	/* dead_tilde */
+		'"',	/* dead_diaeresis */
+		',',	/* dead_cedilla */
+		'_',	/* dead_macron */
+		'U',	/* dead_breve */
+		'.',	/* dead_abovedot */
+		'*',	/* dead_abovering */
+		'=',	/* dead_doubleacute */
+		'c',	/* dead_caron */
+		'k',	/* dead_ogonek */
+		'i',	/* dead_iota */
+		'#',	/* dead_voiced_sound */
+		'o',	/* dead_semivoiced_sound */
+		'!',	/* dead_belowdot */
+		'?',	/* dead_hook */
+		'+',	/* dead_horn */
+		'-',	/* dead_stroke */
+		')',	/* dead_abovecomma */
+		'(',	/* dead_abovereversedcomma */
+		':',	/* dead_doublegrave */
+		'n',	/* dead_invertedbreve */
+		';',	/* dead_belowcomma */
+		'$',	/* dead_currency */
+		'@',	/* dead_greek */
+	};
 
 	k_deadunicode(vc, ret_diacr[value], up_flag);
 }
diff --git a/include/uapi/linux/keyboard.h b/include/uapi/linux/keyboard.h
index ab4108c83186..4846716e7c5c 100644
--- a/include/uapi/linux/keyboard.h
+++ b/include/uapi/linux/keyboard.h
@@ -357,8 +357,29 @@
 #define K_DTILDE	K(KT_DEAD,3)
 #define K_DDIERE	K(KT_DEAD,4)
 #define K_DCEDIL	K(KT_DEAD,5)
+#define K_DMACRON	K(KT_DEAD,6)
+#define K_DBREVE	K(KT_DEAD,7)
+#define K_DABDOT	K(KT_DEAD,8)
+#define K_DABRING	K(KT_DEAD,9)
+#define K_DDBACUTE	K(KT_DEAD,10)
+#define K_DCARON	K(KT_DEAD,11)
+#define K_DOGONEK	K(KT_DEAD,12)
+#define K_DIOTA		K(KT_DEAD,13)
+#define K_DVOICED	K(KT_DEAD,14)
+#define K_DSEMVOICED	K(KT_DEAD,15)
+#define K_DBEDOT	K(KT_DEAD,16)
+#define K_DHOOK		K(KT_DEAD,17)
+#define K_DHORN		K(KT_DEAD,18)
+#define K_DSTROKE	K(KT_DEAD,19)
+#define K_DABCOMMA	K(KT_DEAD,20)
+#define K_DABREVCOMMA	K(KT_DEAD,21)
+#define K_DDBGRAVE	K(KT_DEAD,22)
+#define K_DINVBREVE	K(KT_DEAD,23)
+#define K_DBECOMMA	K(KT_DEAD,24)
+#define K_DCURRENCY	K(KT_DEAD,25)
+#define K_DGREEK	K(KT_DEAD,26)
 
-#define NR_DEAD		6
+#define NR_DEAD		27
 
 #define K_DOWN		K(KT_CUR,0)
 #define K_LEFT		K(KT_CUR,1)
-- 
cgit v1.2.3


From b233504033dbd65740e59681820ccfd0a2a8ec53 Mon Sep 17 00:00:00 2001
From: Yifeng Sun <pkusunyifeng@gmail.com>
Date: Mon, 2 Jul 2018 08:18:03 -0700
Subject: openvswitch: kernel datapath clone action

Add 'clone' action to kernel datapath by using existing functions.
When actions within clone don't modify the current flow, the flow
key is not cloned before executing clone actions.

This is a follow up patch for this incomplete work:
https://patchwork.ozlabs.org/patch/722096/

v1 -> v2:
Refactor as advised by reviewer.

Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com>
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/openvswitch.h      |  5 +++
 include/uapi/linux/openvswitch.h |  3 ++
 net/openvswitch/actions.c        | 33 ++++++++++++++++++
 net/openvswitch/flow_netlink.c   | 73 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index e6b240b6196c..379affc63e24 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -21,4 +21,9 @@
 
 #include <uapi/linux/openvswitch.h>
 
+#define OVS_CLONE_ATTR_EXEC      0   /* Specify an u32 value. When nonzero,
+				      * actions in clone will not change flow
+				      * keys. False otherwise.
+				      */
+
 #endif /* _LINUX_OPENVSWITCH_H */
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 863aabaa5cc9..dbe0cbe4f1b7 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -840,6 +840,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
  * packet, or modify the packet (e.g., change the DSCP field).
+ * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of
+ * actions without affecting the original packet and key.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -873,6 +875,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
 	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
+	OVS_ACTION_ATTR_CLONE,        /* Nested OVS_CLONE_ATTR_*.  */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 30a5df27116e..85ae53d8fd09 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
 			     clone_flow_key);
 }
 
+/* When 'last' is true, clone() should always consume the 'skb'.
+ * Otherwise, clone() should keep 'skb' intact regardless of what
+ * actions are executed within clone().
+ */
+static int clone(struct datapath *dp, struct sk_buff *skb,
+		 struct sw_flow_key *key, const struct nlattr *attr,
+		 bool last)
+{
+	struct nlattr *actions;
+	struct nlattr *clone_arg;
+	int rem = nla_len(attr);
+	bool dont_clone_flow_key;
+
+	/* The first attribute is always 'OVS_CLONE_ATTR_EXEC'. */
+	clone_arg = nla_data(attr);
+	dont_clone_flow_key = nla_get_u32(clone_arg);
+	actions = nla_next(clone_arg, &rem);
+
+	return clone_execute(dp, skb, key, 0, actions, rem, last,
+			     !dont_clone_flow_key);
+}
+
 static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
 			 const struct nlattr *attr)
 {
@@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 				consume_skb(skb);
 				return 0;
 			}
+			break;
+
+		case OVS_ACTION_ATTR_CLONE: {
+			bool last = nla_is_last(a, rem);
+
+			err = clone(dp, skb, key, a, last);
+			if (last)
+				return err;
+
+			break;
+		}
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 391c4073a6dc..a70097ecf33c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
 	return 0;
 }
 
+static int validate_and_copy_clone(struct net *net,
+				   const struct nlattr *attr,
+				   const struct sw_flow_key *key,
+				   struct sw_flow_actions **sfa,
+				   __be16 eth_type, __be16 vlan_tci,
+				   bool log, bool last)
+{
+	int start, err;
+	u32 exec;
+
+	if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN)
+		return -EINVAL;
+
+	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log);
+	if (start < 0)
+		return start;
+
+	exec = last || !actions_may_change_flow(attr);
+
+	err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec,
+				 sizeof(exec), log);
+	if (err)
+		return err;
+
+	err = __ovs_nla_copy_actions(net, attr, key, sfa,
+				     eth_type, vlan_tci, log);
+	if (err)
+		return err;
+
+	add_nested_action_end(*sfa, start);
+
+	return 0;
+}
+
 void ovs_match_init(struct sw_flow_match *match,
 		    struct sw_flow_key *key,
 		    bool reset_key,
@@ -2849,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
 			[OVS_ACTION_ATTR_POP_NSH] = 0,
 			[OVS_ACTION_ATTR_METER] = sizeof(u32),
+			[OVS_ACTION_ATTR_CLONE] = (u32)-1,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -3038,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			/* Non-existent meters are simply ignored.  */
 			break;
 
+		case OVS_ACTION_ATTR_CLONE: {
+			bool last = nla_is_last(a, rem);
+
+			err = validate_and_copy_clone(net, a, key, sfa,
+						      eth_type, vlan_tci,
+						      log, last);
+			if (err)
+				return err;
+			skip_copy = true;
+			break;
+		}
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
@@ -3116,6 +3163,26 @@ out:
 	return err;
 }
 
+static int clone_action_to_attr(const struct nlattr *attr,
+				struct sk_buff *skb)
+{
+	struct nlattr *start;
+	int err = 0, rem = nla_len(attr);
+
+	start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE);
+	if (!start)
+		return -EMSGSIZE;
+
+	err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+
+	if (err)
+		nla_nest_cancel(skb, start);
+	else
+		nla_nest_end(skb, start);
+
+	return err;
+}
+
 static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 {
 	const struct nlattr *ovs_key = nla_data(a);
@@ -3204,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
 				return err;
 			break;
 
+		case OVS_ACTION_ATTR_CLONE:
+			err = clone_action_to_attr(a, skb);
+			if (err)
+				return err;
+			break;
+
 		default:
 			if (nla_put(skb, type, nla_len(a), nla_data(a)))
 				return -EMSGSIZE;
-- 
cgit v1.2.3


From b3e7e7d2d668de0102264302a4d10dd9d4438a42 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 15 Jun 2018 14:55:18 -0700
Subject: include/uapi/linux/blkzoned.h: Remove a superfluous __packed
 directive

Using the __packed directive for a structure that does not need
it is wrong because it makes gcc generate suboptimal code on some
architectures. Hence remove the __packed directive from the
blk_zone_report structure definition. See also
http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Matias Bjorling <mb@lightnvm.io>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blkzoned.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index e3c70fe6bf0f..ff5a5db8906a 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -117,7 +117,7 @@ struct blk_zone_report {
 	__u32		nr_zones;
 	__u8		reserved[4];
 	struct blk_zone zones[0];
-} __packed;
+};
 
 /**
  * struct blk_zone_range - BLKRESETZONE ioctl request
-- 
cgit v1.2.3


From 52b509218f0ab5946f9cbaf5501d88f69333f0e3 Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Mon, 9 Jul 2018 16:20:56 -0700
Subject: net: Use __u32 in uapi net_stamp.h

We are not supposed to use u32 in uapi, so change the flags member of
struct sock_txtime from u32 to __u32 instead.

Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/net_tstamp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index f8f4539f1135..97ff3c17ec4d 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -155,8 +155,8 @@ enum txtime_flags {
 };
 
 struct sock_txtime {
-	clockid_t       clockid;        /* reference clockid */
-	u32             flags;          /* flags defined by enum txtime_flags */
+	clockid_t	clockid;	/* reference clockid */
+	__u32		flags;		/* as defined by enum txtime_flags */
 };
 
 #endif /* _NET_TIMESTAMPING_H */
-- 
cgit v1.2.3


From e96d71359e9bbea846a2111e4469a03a055dfa6f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:50 -0400
Subject: rseq: Use __u64 for rseq_cs fields, validate user inputs

Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip
fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather
than ignoring the 32 upper bits on 32-bit kernels. This ensures we have a
consistent behavior for a 32-bit binary executed on 32-bit kernels and in
compat mode on 64-bit kernels.

Validating the value of abort_ip field to be below TASK_SIZE ensures the
kernel doesn't return to an invalid address when returning to userspace
after an abort. I don't fully trust each architecture code to consistently
deal with invalid return addresses.

Validating the value of the start_ip and post_commit_offset fields
prevents overflow on arithmetic performed on those values, used to
check whether abort_ip is within the rseq critical section.

If validation fails, the process is killed with a segmentation fault.

When the signature encountered before abort_ip does not match the expected
signature, return -EINVAL rather than -EPERM to be consistent with other
input validation return codes from rseq_get_rseq_cs().

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-2-mathieu.desnoyers@efficios.com
---
 include/uapi/linux/rseq.h |  6 +++---
 kernel/rseq.c             | 14 ++++++++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index d620fa43756c..519ad6e176d1 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -52,10 +52,10 @@ struct rseq_cs {
 	__u32 version;
 	/* enum rseq_cs_flags */
 	__u32 flags;
-	LINUX_FIELD_u32_u64(start_ip);
+	__u64 start_ip;
 	/* Offset from start_ip. */
-	LINUX_FIELD_u32_u64(post_commit_offset);
-	LINUX_FIELD_u32_u64(abort_ip);
+	__u64 post_commit_offset;
+	__u64 abort_ip;
 } __attribute__((aligned(4 * sizeof(__u64))));
 
 /*
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 22b6acf1ad63..16b38c5342f9 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -130,14 +130,20 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 	urseq_cs = (struct rseq_cs __user *)ptr;
 	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
 		return -EFAULT;
-	if (rseq_cs->version > 0)
-		return -EINVAL;
 
+	if (rseq_cs->start_ip >= TASK_SIZE ||
+	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
+	    rseq_cs->abort_ip >= TASK_SIZE ||
+	    rseq_cs->version > 0)
+		return -EINVAL;
+	/* Check for overflow. */
+	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
+		return -EINVAL;
 	/* Ensure that abort_ip is not in the critical section. */
 	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
 		return -EINVAL;
 
-	usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
 	ret = get_user(sig, usig);
 	if (ret)
 		return ret;
@@ -146,7 +152,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 		printk_ratelimited(KERN_WARNING
 			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
 			sig, current->rseq_sig, current->pid, usig);
-		return -EPERM;
+		return -EINVAL;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 0fb9a1abc8c97f858997e962694eb36b4517144e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:52 -0400
Subject: rseq: uapi: Update uapi comments

Update rseq uapi header comments to reflect that user-space needs to do
thread-local loads/stores from/to the struct rseq fields.

As a consequence of this added requirement, the kernel does not need
to perform loads/stores with single-copy atomicity.

Update the comment associated to the "flags" fields to describe
more accurately that it's only useful to facilitate single-stepping
through rseq critical sections with debuggers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-4-mathieu.desnoyers@efficios.com
---
 include/uapi/linux/rseq.h | 69 ++++++++++++++++++++++++-----------------------
 kernel/rseq.c             |  2 +-
 2 files changed, 37 insertions(+), 34 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 519ad6e176d1..bf4188c13bec 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -67,28 +67,30 @@ struct rseq_cs {
 struct rseq {
 	/*
 	 * Restartable sequences cpu_id_start field. Updated by the
-	 * kernel, and read by user-space with single-copy atomicity
-	 * semantics. Aligned on 32-bit. Always contains a value in the
-	 * range of possible CPUs, although the value may not be the
-	 * actual current CPU (e.g. if rseq is not initialized). This
-	 * CPU number value should always be compared against the value
-	 * of the cpu_id field before performing a rseq commit or
-	 * returning a value read from a data structure indexed using
-	 * the cpu_id_start value.
+	 * kernel. Read by user-space with single-copy atomicity
+	 * semantics. This field should only be read by the thread which
+	 * registered this data structure. Aligned on 32-bit. Always
+	 * contains a value in the range of possible CPUs, although the
+	 * value may not be the actual current CPU (e.g. if rseq is not
+	 * initialized). This CPU number value should always be compared
+	 * against the value of the cpu_id field before performing a rseq
+	 * commit or returning a value read from a data structure indexed
+	 * using the cpu_id_start value.
 	 */
 	__u32 cpu_id_start;
 	/*
-	 * Restartable sequences cpu_id field. Updated by the kernel,
-	 * and read by user-space with single-copy atomicity semantics.
-	 * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and
-	 * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the
-	 * former means "rseq uninitialized", and latter means "rseq
-	 * initialization failed". This value is meant to be read within
-	 * rseq critical sections and compared with the cpu_id_start
-	 * value previously read, before performing the commit instruction,
-	 * or read and compared with the cpu_id_start value before returning
-	 * a value loaded from a data structure indexed using the
-	 * cpu_id_start value.
+	 * Restartable sequences cpu_id field. Updated by the kernel.
+	 * Read by user-space with single-copy atomicity semantics. This
+	 * field should only be read by the thread which registered this
+	 * data structure. Aligned on 32-bit. Values
+	 * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED
+	 * have a special semantic: the former means "rseq uninitialized",
+	 * and latter means "rseq initialization failed". This value is
+	 * meant to be read within rseq critical sections and compared
+	 * with the cpu_id_start value previously read, before performing
+	 * the commit instruction, or read and compared with the
+	 * cpu_id_start value before returning a value loaded from a data
+	 * structure indexed using the cpu_id_start value.
 	 */
 	__u32 cpu_id;
 	/*
@@ -105,27 +107,28 @@ struct rseq {
 	 * targeted by the rseq_cs. Also needs to be set to NULL by user-space
 	 * before reclaiming memory that contains the targeted struct rseq_cs.
 	 *
-	 * Read and set by the kernel with single-copy atomicity semantics.
-	 * Set by user-space with single-copy atomicity semantics. Aligned
-	 * on 64-bit.
+	 * Read and set by the kernel. Set by user-space with single-copy
+	 * atomicity semantics. This field should only be updated by the
+	 * thread which registered this data structure. Aligned on 64-bit.
 	 */
 	LINUX_FIELD_u32_u64(rseq_cs);
 	/*
-	 * - RSEQ_DISABLE flag:
+	 * Restartable sequences flags field.
+	 *
+	 * This field should only be updated by the thread which
+	 * registered this data structure. Read by the kernel.
+	 * Mainly used for single-stepping through rseq critical sections
+	 * with debuggers.
 	 *
-	 * Fallback fast-track flag for single-stepping.
-	 * Set by user-space if lack of progress is detected.
-	 * Cleared by user-space after rseq finish.
-	 * Read by the kernel.
 	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
-	 *     Inhibit instruction sequence block restart and event
-	 *     counter increment on preemption for this thread.
+	 *     Inhibit instruction sequence block restart on preemption
+	 *     for this thread.
 	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
-	 *     Inhibit instruction sequence block restart and event
-	 *     counter increment on signal delivery for this thread.
+	 *     Inhibit instruction sequence block restart on signal
+	 *     delivery for this thread.
 	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
-	 *     Inhibit instruction sequence block restart and event
-	 *     counter increment on migration for this thread.
+	 *     Inhibit instruction sequence block restart on migration for
+	 *     this thread.
 	 */
 	__u32 flags;
 } __attribute__((aligned(4 * sizeof(__u64))));
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2c8463acb50d..2a7748675be7 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -201,7 +201,7 @@ static int clear_rseq_cs(struct task_struct *t)
 	 * of code outside of the rseq assembly block. This performs
 	 * a lazy clear of the rseq_cs field.
 	 *
-	 * Set rseq_cs to NULL with single-copy atomicity.
+	 * Set rseq_cs to NULL.
 	 */
 	return put_user(0UL, &t->rseq->rseq_cs);
 }
-- 
cgit v1.2.3


From ec9c82e03a744e5698bd95eab872855861a821fa Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:53 -0400
Subject: rseq: uapi: Declare rseq_cs field as union, update includes

Declaring the rseq_cs field as a union between __u64 and two __u32
allows both 32-bit and 64-bit kernels to read the full __u64, and
therefore validate that a 32-bit user-space cleared the upper 32
bits, thus ensuring a consistent behavior between native 32-bit
kernels and 32-bit compat tasks on 64-bit kernels.

Check that the rseq_cs value read is < TASK_SIZE.

The asm/byteorder.h header needs to be included by rseq.h, now
that it is not using linux/types_32_64.h anymore.

Considering that only __u32 and __u64 types are declared in linux/rseq.h,
the linux/types.h header should always be included for both kernel and
user-space code: including stdint.h is just for u64 and u32, which are
not used in this header at all.

Use copy_from_user()/clear_user() to interact with a 64-bit field,
because arm32 does not implement 64-bit __get_user, and ppc32 does not
implement 64-bit get_user. Considering that the rseq_cs pointer does not need to
be loaded/stored with single-copy atomicity from the kernel anymore, we
can simply use copy_from_user()/clear_user().

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-5-mathieu.desnoyers@efficios.com
---
 include/uapi/linux/rseq.h           | 27 +++++++++++++++++++--------
 kernel/rseq.c                       | 15 +++++++++------
 tools/testing/selftests/rseq/rseq.h | 11 ++++++++++-
 3 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index bf4188c13bec..9a402fdb60e9 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -10,13 +10,8 @@
  * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
-#ifdef __KERNEL__
-# include <linux/types.h>
-#else
-# include <stdint.h>
-#endif
-
-#include <linux/types_32_64.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
 
 enum rseq_cpu_id_state {
 	RSEQ_CPU_ID_UNINITIALIZED		= -1,
@@ -111,7 +106,23 @@ struct rseq {
 	 * atomicity semantics. This field should only be updated by the
 	 * thread which registered this data structure. Aligned on 64-bit.
 	 */
-	LINUX_FIELD_u32_u64(rseq_cs);
+	union {
+		__u64 ptr64;
+#ifdef __LP64__
+		__u64 ptr;
+#else
+		struct {
+#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN)
+			__u32 padding;		/* Initialized to zero. */
+			__u32 ptr32;
+#else /* LITTLE */
+			__u32 ptr32;
+			__u32 padding;		/* Initialized to zero. */
+#endif /* ENDIAN */
+		} ptr;
+#endif
+	} rseq_cs;
+
 	/*
 	 * Restartable sequences flags field.
 	 *
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2a7748675be7..c6242d8594dc 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -115,19 +115,20 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
 static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 {
 	struct rseq_cs __user *urseq_cs;
-	unsigned long ptr;
+	u64 ptr;
 	u32 __user *usig;
 	u32 sig;
 	int ret;
 
-	ret = get_user(ptr, &t->rseq->rseq_cs);
-	if (ret)
-		return ret;
+	if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+		return -EFAULT;
 	if (!ptr) {
 		memset(rseq_cs, 0, sizeof(*rseq_cs));
 		return 0;
 	}
-	urseq_cs = (struct rseq_cs __user *)ptr;
+	if (ptr >= TASK_SIZE)
+		return -EINVAL;
+	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
 	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
 		return -EFAULT;
 
@@ -203,7 +204,9 @@ static int clear_rseq_cs(struct task_struct *t)
 	 *
 	 * Set rseq_cs to NULL.
 	 */
-	return put_user(0UL, &t->rseq->rseq_cs);
+	if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+		return -EFAULT;
+	return 0;
 }
 
 /*
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index a4684112676c..f2073cfa4448 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -133,6 +133,15 @@ static inline uint32_t rseq_current_cpu(void)
 	return cpu;
 }
 
+static inline void rseq_clear_rseq_cs(void)
+{
+#ifdef __LP64__
+	__rseq_abi.rseq_cs.ptr = 0;
+#else
+	__rseq_abi.rseq_cs.ptr.ptr32 = 0;
+#endif
+}
+
 /*
  * rseq_prepare_unload() should be invoked by each thread using rseq_finish*()
  * at least once between their last rseq_finish*() and library unload of the
@@ -143,7 +152,7 @@ static inline uint32_t rseq_current_cpu(void)
  */
 static inline void rseq_prepare_unload(void)
 {
-	__rseq_abi.rseq_cs = 0;
+	rseq_clear_rseq_cs();
 }
 
 #endif  /* RSEQ_H_ */
-- 
cgit v1.2.3


From 4f4c0acdf4652a964da869d578a3c8bf6df14ce2 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Mon, 9 Jul 2018 15:51:54 -0400
Subject: rseq: Remove unused types_32_64.h uapi header

This header was introduced in the 4.18 merge window, and rseq does
not need it anymore. Nuke it before the final release.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-api@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Paul Turner <pjt@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chris Lameter <cl@linux.com>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lkml.kernel.org/r/20180709195155.7654-6-mathieu.desnoyers@efficios.com
---
 include/uapi/linux/types_32_64.h | 50 ----------------------------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 include/uapi/linux/types_32_64.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/types_32_64.h b/include/uapi/linux/types_32_64.h
deleted file mode 100644
index 0a87ace34a57..000000000000
--- a/include/uapi/linux/types_32_64.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
-#ifndef _UAPI_LINUX_TYPES_32_64_H
-#define _UAPI_LINUX_TYPES_32_64_H
-
-/*
- * linux/types_32_64.h
- *
- * Integer type declaration for pointers across 32-bit and 64-bit systems.
- *
- * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- */
-
-#ifdef __KERNEL__
-# include <linux/types.h>
-#else
-# include <stdint.h>
-#endif
-
-#include <asm/byteorder.h>
-
-#ifdef __BYTE_ORDER
-# if (__BYTE_ORDER == __BIG_ENDIAN)
-#  define LINUX_BYTE_ORDER_BIG_ENDIAN
-# else
-#  define LINUX_BYTE_ORDER_LITTLE_ENDIAN
-# endif
-#else
-# ifdef __BIG_ENDIAN
-#  define LINUX_BYTE_ORDER_BIG_ENDIAN
-# else
-#  define LINUX_BYTE_ORDER_LITTLE_ENDIAN
-# endif
-#endif
-
-#ifdef __LP64__
-# define LINUX_FIELD_u32_u64(field)			__u64 field
-# define LINUX_FIELD_u32_u64_INIT_ONSTACK(field, v)	field = (intptr_t)v
-#else
-# ifdef LINUX_BYTE_ORDER_BIG_ENDIAN
-#  define LINUX_FIELD_u32_u64(field)	__u32 field ## _padding, field
-#  define LINUX_FIELD_u32_u64_INIT_ONSTACK(field, v)	\
-	field ## _padding = 0, field = (intptr_t)v
-# else
-#  define LINUX_FIELD_u32_u64(field)	__u32 field, field ## _padding
-#  define LINUX_FIELD_u32_u64_INIT_ONSTACK(field, v)	\
-	field = (intptr_t)v, field ## _padding = 0
-# endif
-#endif
-
-#endif /* _UAPI_LINUX_TYPES_32_64_H */
-- 
cgit v1.2.3


From 046f6fd5daefac7f5abdafb436b30f63bc7c602b Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Fri, 6 Jul 2018 17:37:19 +0200
Subject: sched: Add Common Applications Kept Enhanced (cake) qdisc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sch_cake targets the home router use case and is intended to squeeze the
most bandwidth and latency out of even the slowest ISP links and routers,
while presenting an API simple enough that even an ISP can configure it.

Example of use on a cable ISP uplink:

tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter

To shape a cable download link (ifb and tc-mirred setup elided)

tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash

CAKE is filled with:

* A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
  derived Flow Queuing system, which autoconfigures based on the bandwidth.
* A novel "triple-isolate" mode (the default) which balances per-host
  and per-flow FQ even through NAT.
* A deficit-based shaper that can also be used in an unlimited mode.
* 8 way set associative hashing to reduce flow collisions to a minimum.
* A reasonable interpretation of various diffserv latency/loss tradeoffs.
* Support for zeroing diffserv markings for entering and exiting traffic.
* Support for interacting well with Docsis 3.0 shaper framing.
* Extensive support for DSL framing types.
* Support for ack filtering.
* Extensive statistics for measuring loss, ECN markings, and latency
  variation.

A paper describing the design of CAKE is available at
https://arxiv.org/abs/1804.07617, and will be published at the 2018 IEEE
International Symposium on Local and Metropolitan Area Networks (LANMAN).

This patch adds the base shaper and packet scheduler, while subsequent
commits add the optional (configurable) features. The full userspace API
and most data structures are included in this commit, but options not
understood in the base version will be ignored.

Various versions, still baking, have been available as an out-of-tree build for
kernel versions going back to 3.10, as the embedded router world has been
running a few years behind mainline Linux. A stable version has been
generally available on lede-17.01 and later.

sch_cake replaces a combination of iptables, tc filter, htb and fq_codel
in the sqm-scripts, with sane defaults and vastly simpler configuration.

CAKE's principal author is Jonathan Morton, with contributions from
Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller,
Ryan Mounce, Tony Ambardar, Dean Scarff, Nils Andreas Svee, Dave Täht,
and Loganaden Velvindron.

Testing from Pete Heist, Georgios Amanakis, and the many other members of
the cake@lists.bufferbloat.net mailing list.

tc -s qdisc show dev eth2
 qdisc cake 8017: root refcnt 2 bandwidth 1Gbit diffserv3 triple-isolate split-gso rtt 100.0ms noatm overhead 38 mpu 84
 Sent 51504294511 bytes 37724591 pkt (dropped 6, overlimits 64958695 requeues 12)
  backlog 0b 0p requeues 12
  memory used: 1053008b of 15140Kb
  capacity estimate: 970Mbit
  min/max network layer size:           28 /    1500
  min/max overhead-adjusted size:       84 /    1538
  average network hdr offset:           14
                    Bulk  Best Effort        Voice
   thresh      62500Kbit        1Gbit      250Mbit
   target          5.0ms        5.0ms        5.0ms
   interval      100.0ms      100.0ms      100.0ms
   pk_delay          5us          5us          6us
   av_delay          3us          2us          2us
   sp_delay          2us          1us          1us
   backlog            0b           0b           0b
   pkts          3164050     25030267      9530280
   bytes      3227519915  35396974782  12879808898
   way_inds            0            8            0
   way_miss           21          366           25
   way_cols            0            0            0
   drops               5            0            1
   marks               0            0            0
   ack_drop            0            0            0
   sp_flows            1            3            0
   bk_flows            0            1            1
   un_flows            0            0            0
   max_len         68130        68130        68130

Tested-by: Pete Heist <peteheist@gmail.com>
Tested-by: Georgios Amanakis <gamanakis@gmail.com>
Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  114 +++
 net/sched/Kconfig              |   11 +
 net/sched/Makefile             |    1 +
 net/sched/sch_cake.c           | 1867 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1993 insertions(+)
 create mode 100644 net/sched/sch_cake.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 949118461009..d9cc9dc4f547 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -955,4 +955,118 @@ enum {
 
 #define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
 
+
+/* CAKE */
+enum {
+	TCA_CAKE_UNSPEC,
+	TCA_CAKE_PAD,
+	TCA_CAKE_BASE_RATE64,
+	TCA_CAKE_DIFFSERV_MODE,
+	TCA_CAKE_ATM,
+	TCA_CAKE_FLOW_MODE,
+	TCA_CAKE_OVERHEAD,
+	TCA_CAKE_RTT,
+	TCA_CAKE_TARGET,
+	TCA_CAKE_AUTORATE,
+	TCA_CAKE_MEMORY,
+	TCA_CAKE_NAT,
+	TCA_CAKE_RAW,
+	TCA_CAKE_WASH,
+	TCA_CAKE_MPU,
+	TCA_CAKE_INGRESS,
+	TCA_CAKE_ACK_FILTER,
+	TCA_CAKE_SPLIT_GSO,
+	__TCA_CAKE_MAX
+};
+#define TCA_CAKE_MAX	(__TCA_CAKE_MAX - 1)
+
+enum {
+	__TCA_CAKE_STATS_INVALID,
+	TCA_CAKE_STATS_PAD,
+	TCA_CAKE_STATS_CAPACITY_ESTIMATE64,
+	TCA_CAKE_STATS_MEMORY_LIMIT,
+	TCA_CAKE_STATS_MEMORY_USED,
+	TCA_CAKE_STATS_AVG_NETOFF,
+	TCA_CAKE_STATS_MIN_NETLEN,
+	TCA_CAKE_STATS_MAX_NETLEN,
+	TCA_CAKE_STATS_MIN_ADJLEN,
+	TCA_CAKE_STATS_MAX_ADJLEN,
+	TCA_CAKE_STATS_TIN_STATS,
+	TCA_CAKE_STATS_DEFICIT,
+	TCA_CAKE_STATS_COBALT_COUNT,
+	TCA_CAKE_STATS_DROPPING,
+	TCA_CAKE_STATS_DROP_NEXT_US,
+	TCA_CAKE_STATS_P_DROP,
+	TCA_CAKE_STATS_BLUE_TIMER_US,
+	__TCA_CAKE_STATS_MAX
+};
+#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1)
+
+enum {
+	__TCA_CAKE_TIN_STATS_INVALID,
+	TCA_CAKE_TIN_STATS_PAD,
+	TCA_CAKE_TIN_STATS_SENT_PACKETS,
+	TCA_CAKE_TIN_STATS_SENT_BYTES64,
+	TCA_CAKE_TIN_STATS_DROPPED_PACKETS,
+	TCA_CAKE_TIN_STATS_DROPPED_BYTES64,
+	TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS,
+	TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64,
+	TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS,
+	TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64,
+	TCA_CAKE_TIN_STATS_BACKLOG_PACKETS,
+	TCA_CAKE_TIN_STATS_BACKLOG_BYTES,
+	TCA_CAKE_TIN_STATS_THRESHOLD_RATE64,
+	TCA_CAKE_TIN_STATS_TARGET_US,
+	TCA_CAKE_TIN_STATS_INTERVAL_US,
+	TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS,
+	TCA_CAKE_TIN_STATS_WAY_MISSES,
+	TCA_CAKE_TIN_STATS_WAY_COLLISIONS,
+	TCA_CAKE_TIN_STATS_PEAK_DELAY_US,
+	TCA_CAKE_TIN_STATS_AVG_DELAY_US,
+	TCA_CAKE_TIN_STATS_BASE_DELAY_US,
+	TCA_CAKE_TIN_STATS_SPARSE_FLOWS,
+	TCA_CAKE_TIN_STATS_BULK_FLOWS,
+	TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS,
+	TCA_CAKE_TIN_STATS_MAX_SKBLEN,
+	TCA_CAKE_TIN_STATS_FLOW_QUANTUM,
+	__TCA_CAKE_TIN_STATS_MAX
+};
+#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1)
+#define TC_CAKE_MAX_TINS (8)
+
+enum {
+	CAKE_FLOW_NONE = 0,
+	CAKE_FLOW_SRC_IP,
+	CAKE_FLOW_DST_IP,
+	CAKE_FLOW_HOSTS,    /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */
+	CAKE_FLOW_FLOWS,
+	CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */
+	CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */
+	CAKE_FLOW_TRIPLE,   /* = CAKE_FLOW_HOSTS  | CAKE_FLOW_FLOWS */
+	CAKE_FLOW_MAX,
+};
+
+enum {
+	CAKE_DIFFSERV_DIFFSERV3 = 0,
+	CAKE_DIFFSERV_DIFFSERV4,
+	CAKE_DIFFSERV_DIFFSERV8,
+	CAKE_DIFFSERV_BESTEFFORT,
+	CAKE_DIFFSERV_PRECEDENCE,
+	CAKE_DIFFSERV_MAX
+};
+
+enum {
+	CAKE_ACK_NONE = 0,
+	CAKE_ACK_FILTER,
+	CAKE_ACK_AGGRESSIVE,
+	CAKE_ACK_MAX
+};
+
+enum {
+	CAKE_ATM_NONE = 0,
+	CAKE_ATM_ATM,
+	CAKE_ATM_PTM,
+	CAKE_ATM_MAX
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index fcc89706745b..7af246764a35 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -295,6 +295,17 @@ config NET_SCH_FQ_CODEL
 
 	  If unsure, say N.
 
+config NET_SCH_CAKE
+	tristate "Common Applications Kept Enhanced (CAKE)"
+	help
+	  Say Y here if you want to use the Common Applications Kept Enhanced
+          (CAKE) queue management algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_cake.
+
+	  If unsure, say N.
+
 config NET_SCH_FQ
 	tristate "Fair Queue"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 9a5a7077d217..673ee7d26ff2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_CAKE)	+= sch_cake.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
new file mode 100644
index 000000000000..ea0272615d63
--- /dev/null
+++ b/net/sched/sch_cake.c
@@ -0,0 +1,1867 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* COMMON Applications Kept Enhanced (CAKE) discipline
+ *
+ * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
+ * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
+ * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
+ * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
+ * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
+ * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
+ *
+ * The CAKE Principles:
+ *		   (or, how to have your cake and eat it too)
+ *
+ * This is a combination of several shaping, AQM and FQ techniques into one
+ * easy-to-use package:
+ *
+ * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
+ *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
+ *   eliminating the need for any sort of burst parameter (eg. token bucket
+ *   depth).  Burst support is limited to that necessary to overcome scheduling
+ *   latency.
+ *
+ * - A Diffserv-aware priority queue, giving more priority to certain classes,
+ *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
+ *   the priority is reduced to avoid starving other tins.
+ *
+ * - Each priority tin has a separate Flow Queue system, to isolate traffic
+ *   flows from each other.  This prevents a burst on one flow from increasing
+ *   the delay to another.  Flows are distributed to queues using a
+ *   set-associative hash function.
+ *
+ * - Each queue is actively managed by Cobalt, which is a combination of the
+ *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
+ *   congestion early via ECN (if available) and/or packet drops, to keep
+ *   latency low.  The codel parameters are auto-tuned based on the bandwidth
+ *   setting, as is necessary at low bandwidths.
+ *
+ * The configuration parameters are kept deliberately simple for ease of use.
+ * Everything has sane defaults.  Complete generality of configuration is *not*
+ * a goal.
+ *
+ * The priority queue operates according to a weighted DRR scheme, combined with
+ * a bandwidth tracker which reuses the shaper logic to detect which side of the
+ * bandwidth sharing threshold the tin is operating.  This determines whether a
+ * priority-based weight (high) or a bandwidth-based weight (low) is used for
+ * that tin in the current pass.
+ *
+ * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
+ * granted us permission to leverage.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/reciprocal_div.h>
+#include <net/netlink.h>
+#include <linux/version.h>
+#include <linux/if_vlan.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tcp.h>
+#include <net/flow_dissector.h>
+
+#define CAKE_SET_WAYS (8)
+#define CAKE_MAX_TINS (8)
+#define CAKE_QUEUES (1024)
+#define CAKE_FLOW_MASK 63
+#define CAKE_FLOW_NAT_FLAG 64
+
+/* struct cobalt_params - tuning constants shared by the Codel and BLUE halves
+ * @interval:	Codel interval in ns; scaled by 1/sqrt(count) to space signals
+ * @target:	sojourn threshold in ns; also the minimum gap of BLUE updates
+ * @mtu_time:	serialisation delay of a max-size packet; 4x is a sojourn floor
+ * @p_inc:	step added to the BLUE drop probability (0.32 fixed point)
+ * @p_dec:	step removed from the BLUE drop probability (0.32 fixed point)
+ */
+struct cobalt_params {
+	u64	interval;
+	u64	target;
+	u64	mtu_time;
+	u32	p_inc;
+	u32	p_dec;
+};
+
+/* struct cobalt_vars - per-queue Codel and BLUE state
+ * @count:		Codel signalling count; sets the signalling frequency
+ * @rec_inv_sqrt:	cached 1/sqrt(count) as a Q0.32 fixed-point fraction
+ * @drop_next:		time of the next Codel signal, or an activity timeout
+ * @blue_timer:		time of the most recent change to @p_drop
+ * @p_drop:		BLUE drop probability (0.32 fxp)
+ * @dropping:		set while in the Codel dropping state
+ * @ecn_marked:		set if the last cobalt_should_drop() CE-marked its skb
+ */
+struct cobalt_vars {
+	u32	count;
+	u32	rec_inv_sqrt;
+	ktime_t	drop_next;
+	ktime_t	blue_timer;
+	u32     p_drop;
+	bool	dropping;
+	bool    ecn_marked;
+};
+
+enum { /* values of cake_flow.set: which rotation a flow queue belongs to */
+	CAKE_SET_NONE = 0,    /* not yet linked into a flow list */
+	CAKE_SET_SPARSE,      /* on new_flows, counted in sparse_flow_count */
+	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
+	CAKE_SET_BULK,        /* on old_flows, counted in bulk_flow_count */
+	CAKE_SET_DECAYING     /* empty, kept on decaying_flows for a while */
+};
+
+struct cake_flow {
+	/* this stuff is all needed per-flow at dequeue time */
+	struct sk_buff	  *head; /* oldest queued skb (dequeue end) */
+	struct sk_buff	  *tail; /* newest queued skb (enqueue end) */
+	struct list_head  flowchain; /* link in one of the tin's flow lists */
+	s32		  deficit; /* DRR credit; topped up once it is <= 0 */
+	u32		  dropped; /* drop counter, bumped in cake_drop() */
+	struct cobalt_vars cvars; /* AQM state of this queue */
+	u16		  srchost; /* index into cake_host table */
+	u16		  dsthost; /* likewise, for the destination host */
+	u8		  set; /* one of the CAKE_SET_* values */
+}; /* please try to keep this structure <= 64 bytes */
+
+struct cake_host {
+	u32 srchost_tag; /* source-host hash that owns this slot */
+	u32 dsthost_tag; /* destination-host hash that owns this slot */
+	u16 srchost_refcnt; /* flow queues using this slot as source host */
+	u16 dsthost_refcnt; /* flow queues using this slot as dest host */
+};
+
+struct cake_heap_entry { /* names one flow queue inside overflow_heap[] */
+	u16 t:3, b:10; /* t: tin index, b: flow index within that tin */
+};
+
+struct cake_tin_data {
+	struct cake_flow flows[CAKE_QUEUES];
+	u32	backlogs[CAKE_QUEUES]; /* queued packet length per flow */
+	u32	tags[CAKE_QUEUES]; /* for set association */
+	u16	overflow_idx[CAKE_QUEUES]; /* flow's slot in overflow_heap[] */
+	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
+	u16	flow_quantum; /* scaled by quantum_div[] into flow->deficit */
+
+	struct cobalt_params cparams;
+	u32	drop_overlimit; /* packets dropped for exceeding buffer_limit */
+	u16	bulk_flow_count; /* flows in state CAKE_SET_BULK */
+	u16	sparse_flow_count; /* flows in SPARSE or SPARSE_WAIT */
+	u16	decaying_flow_count; /* flows on decaying_flows */
+	u16	unresponsive_flow_count; /* flows with BLUE p_drop active */
+
+	u32	max_skblen; /* largest qdisc_pkt_len() seen at enqueue */
+
+	struct list_head new_flows; /* sparse rotation, served first */
+	struct list_head old_flows; /* bulk rotation */
+	struct list_head decaying_flows; /* empty flows with AQM state left */
+
+	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+	ktime_t	time_next_packet;
+	u64	tin_rate_ns;
+	u64	tin_rate_bps;
+	u16	tin_rate_shft;
+
+	u16	tin_quantum_prio;
+	u16	tin_quantum_band;
+	s32	tin_deficit;
+	u32	tin_backlog; /* sum of backlogs[] for this tin */
+	u32	tin_dropped; /* drop counter, bumped in cake_drop() */
+	u32	tin_ecn_mark;
+
+	u32	packets; /* packets accepted by cake_enqueue() */
+	u64	bytes; /* total length of those packets */
+
+	u32	ack_drops;
+
+	/* moving averages */
+	u64 avge_delay;
+	u64 peak_delay;
+	u64 base_delay;
+
+	/* hash function stats */
+	u32	way_directs; /* direct-mapped lookup matched */
+	u32	way_hits; /* matched another way of the set */
+	u32	way_misses; /* took an empty way of the set */
+	u32	way_collisions; /* set full, original queue shared */
+}; /* number of tins is small, so size of this struct doesn't matter much */
+
+struct cake_sched_data {
+	struct tcf_proto __rcu *filter_list; /* optional external classifier */
+	struct tcf_block *block;
+	struct cake_tin_data *tins; /* indexed by tin number */
+
+	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
+	u16		overflow_timeout; /* 0 = heap must be rebuilt */
+
+	u16		tin_cnt; /* number of tins visited at dequeue */
+	u8		tin_mode;
+	u8		flow_mode; /* CAKE_FLOW_* value given to cake_hash() */
+	u8		ack_filter;
+	u8		atm_mode;
+
+	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+	u16		rate_shft;
+	ktime_t		time_next_packet;
+	ktime_t		failsafe_next_packet;
+	u64		rate_ns; /* 0 selects the unlimited (unshaped) mode */
+	u64		rate_bps;
+	u16		rate_flags;
+	s16		rate_overhead;
+	u16		rate_mpu;
+	u64		interval;
+	u64		target;
+
+	/* resource tracking */
+	u32		buffer_used; /* sum of skb->truesize queued */
+	u32		buffer_max_used; /* high-water mark of buffer_used */
+	u32		buffer_limit; /* cake_drop() runs while above this */
+	u32		buffer_config_limit;
+
+	/* indices for dequeue */
+	u16		cur_tin;
+	u16		cur_flow;
+
+	struct qdisc_watchdog watchdog;
+	const u8	*tin_index;
+	const u8	*tin_order;
+
+	/* bandwidth capacity estimate */
+	ktime_t		last_packet_time; /* time of the latest enqueue */
+	ktime_t		avg_window_begin;
+	u64		avg_packet_interval;
+	u64		avg_window_bytes;
+	u64		avg_peak_bandwidth;
+	ktime_t		last_reconfig_time;
+
+	/* packet length stats */
+	u32		avg_netoff;
+	u16		max_netlen;
+	u16		max_adjlen;
+	u16		min_netlen;
+	u16		min_adjlen;
+};
+
+enum { /* presumably bits of cake_sched_data.rate_flags - TODO confirm */
+	CAKE_FLAG_OVERHEAD	   = BIT(0),
+	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
+	CAKE_FLAG_INGRESS	   = BIT(2),
+	CAKE_FLAG_WASH		   = BIT(3),
+	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
+};
+
+/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
+ * obtain the best features of each.  Codel is excellent on flows which
+ * respond to congestion signals in a TCP-like way.  BLUE is more effective on
+ * unresponsive flows.
+ */
+
+struct cobalt_skb_cb { /* per-packet data kept in the skb's qdisc cb area */
+	ktime_t enqueue_time; /* stamped at enqueue; sojourn = now - this */
+};
+
+static u64 us_to_ns(u64 us)
+{
+	return us * NSEC_PER_USEC; /* microseconds -> nanoseconds */
+}
+
+static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
+{ /* view the skb's qdisc private data as COBALT's per-packet record */
+	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
+	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
+{
+	return get_cobalt_cb(skb)->enqueue_time; /* stored at enqueue */
+}
+
+static void cobalt_set_enqueue_time(struct sk_buff *skb,
+				    ktime_t now)
+{
+	get_cobalt_cb(skb)->enqueue_time = now; /* basis of sojourn time */
+}
+
+static u16 quantum_div[CAKE_QUEUES + 1] = {0}; /* indexed by host_load */
+
+#define REC_INV_SQRT_CACHE (16) /* counts with a precomputed 1/sqrt */
+static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+
+/* Newton-Raphson step for 1/sqrt(count), updating vars->rec_inv_sqrt; see
+ * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+
+static void cobalt_newton_step(struct cobalt_vars *vars)
+{
+	const u32 cur = vars->rec_inv_sqrt;
+	const u32 cur_sq = ((u64)cur * cur) >> 32;
+	u64 acc;
+
+	/* 3 - count * invsqrt^2, with 32 fractional bits */
+	acc = (3LL << 32) - ((u64)vars->count * cur_sq);
+
+	acc >>= 2; /* keep the product below within 64 bits */
+	acc = (acc * cur) >> (32 - 2 + 1);
+
+	vars->rec_inv_sqrt = acc;
+}
+
+static void cobalt_invsqrt(struct cobalt_vars *vars)
+{
+	if (vars->count >= REC_INV_SQRT_CACHE)
+		cobalt_newton_step(vars); /* refine the current estimate */
+	else /* small counts use the precomputed table */
+		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+}
+
+/* There is a big difference in timing between the accurate values placed in
+ * the cache and the approximations given by a single Newton step for small
+ * count values, particularly when stepping from count 1 to 2 or vice versa.
+ * Above 16, a single Newton step gives sufficient accuracy in either
+ * direction, given the precision stored.
+ *
+ * The magnitude of the error when stepping up to count 2 is such as to give
+ * the value that *should* have been produced at count 4.
+ */
+
+static void cobalt_cache_init(void)
+{
+	struct cobalt_vars seed;
+	int step;
+
+	memset(&seed, 0, sizeof(seed));
+	seed.rec_inv_sqrt = ~0U;
+	cobalt_rec_inv_sqrt_cache[0] = ~0U;
+
+	for (seed.count = 1; seed.count < REC_INV_SQRT_CACHE; seed.count++) {
+		/* four Newton steps, starting from the previous entry */
+		for (step = 0; step < 4; step++)
+			cobalt_newton_step(&seed);
+
+		cobalt_rec_inv_sqrt_cache[seed.count] = seed.rec_inv_sqrt;
+	}
+}
+
+static void cobalt_vars_init(struct cobalt_vars *vars)
+{
+	memset(vars, 0, sizeof(*vars));
+
+	if (cobalt_rec_inv_sqrt_cache[0]) /* table was built earlier */
+		return;
+	cobalt_cache_init();
+	cobalt_rec_inv_sqrt_cache[0] = ~0;
+}
+
+/* CoDel control law: the next signal is due at t + interval / sqrt(count).
+ * rec_inv_sqrt holds 1/sqrt(count) as a 32-bit fraction, so one scaled
+ * multiply stands in for both the sqrt() and the divide operation.
+ */
+static ktime_t cobalt_control(ktime_t t, u64 interval,
+			      u32 rec_inv_sqrt)
+{
+	u32 step = reciprocal_scale(interval, rec_inv_sqrt);
+
+	return ktime_add_ns(t, step);
+}
+
+/* Report a drop caused by queue overflow.  Raises the BLUE probability (at
+ * most once per target period) and forces the Codel dropping state on.
+ * Returns true if BLUE was quiescent before this call and is active after. */
+static bool cobalt_queue_full(struct cobalt_vars *vars,
+			      struct cobalt_params *p,
+			      ktime_t now)
+{
+	s64 since_update = ktime_to_ns(ktime_sub(now, vars->blue_timer));
+	bool activated = false;
+
+	if (since_update > p->target) {
+		activated = !vars->p_drop;
+		vars->p_drop += p->p_inc;
+		if (vars->p_drop < p->p_inc) /* wrapped: saturate */
+			vars->p_drop = ~0;
+		vars->blue_timer = now;
+	}
+	vars->count = vars->count ? : 1;
+	vars->dropping = true;
+	vars->drop_next = now;
+
+	return activated;
+}
+
+/* Report that the queue was serviced but held no packet.  Lowers the BLUE
+ * probability and lets the Codel count decay by one step when due.
+ * Returns true if BLUE was active before this call and is quiescent after. */
+static bool cobalt_queue_empty(struct cobalt_vars *vars,
+			       struct cobalt_params *p,
+			       ktime_t now)
+{
+	bool quiesced = false;
+
+	if (vars->p_drop &&
+	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
+		/* step the BLUE probability down, clamping at zero */
+		vars->p_drop -= min(vars->p_drop, p->p_dec);
+		vars->blue_timer = now;
+		quiesced = !vars->p_drop;
+	}
+	vars->dropping = false;
+
+	/* decay is only due with a live count and an elapsed deadline */
+	if (!vars->count || ktime_to_ns(ktime_sub(now, vars->drop_next)) < 0)
+		return quiesced;
+
+	vars->count--;
+	cobalt_invsqrt(vars);
+	vars->drop_next = cobalt_control(vars->drop_next,
+					 p->interval,
+					 vars->rec_inv_sqrt);
+
+	return quiesced;
+}
+
+/* Call this with a freshly dequeued packet for possible congestion marking.
+ * Returns true to drop it, false to deliver; sets vars->ecn_marked if marked.
+ */
+static bool cobalt_should_drop(struct cobalt_vars *vars,
+			       struct cobalt_params *p,
+			       ktime_t now,
+			       struct sk_buff *skb)
+{
+	bool next_due, over_target, drop = false;
+	ktime_t schedule;
+	u64 sojourn;
+
+/* The 'schedule' variable records, in its sign, whether 'now' is before or
+ * after 'drop_next'.  This allows 'drop_next' to be updated before the next
+ * scheduling decision is actually branched, without destroying that
+ * information.  Similarly, the first 'schedule' value calculated is preserved
+ * in the boolean 'next_due'.
+ *
+ * As for 'drop_next', we take advantage of the fact that 'interval' is both
+ * the delay between first exceeding 'target' and the first signalling event,
+ * *and* the scaling factor for the signalling frequency.  It's therefore very
+ * natural to use a single mechanism for both purposes, and eliminates a
+ * significant amount of reference Codel's spaghetti code.  To help with this,
+ * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
+ * as possible to 1.0 in fixed-point.
+ */
+
+	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+	schedule = ktime_sub(now, vars->drop_next);
+	over_target = sojourn > p->target &&
+		      sojourn > p->mtu_time * 4;
+	next_due = vars->count && ktime_to_ns(schedule) >= 0;
+
+	vars->ecn_marked = false; /* cleared on every call */
+
+	if (over_target) {
+		if (!vars->dropping) {
+			vars->dropping = true;
+			vars->drop_next = cobalt_control(now,
+							 p->interval,
+							 vars->rec_inv_sqrt);
+		}
+		if (!vars->count)
+			vars->count = 1; /* next_due needs a non-zero count */
+	} else if (vars->dropping) {
+		vars->dropping = false;
+	}
+
+	if (next_due && vars->dropping) {
+		/* Use ECN mark if possible, otherwise drop */
+		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+
+		vars->count++;
+		if (!vars->count)
+			vars->count--; /* saturate instead of wrapping to 0 */
+		cobalt_invsqrt(vars);
+		vars->drop_next = cobalt_control(vars->drop_next,
+						 p->interval,
+						 vars->rec_inv_sqrt);
+		schedule = ktime_sub(now, vars->drop_next);
+	} else {
+		while (next_due) { /* decay count once per elapsed deadline */
+			vars->count--;
+			cobalt_invsqrt(vars);
+			vars->drop_next = cobalt_control(vars->drop_next,
+							 p->interval,
+							 vars->rec_inv_sqrt);
+			schedule = ktime_sub(now, vars->drop_next);
+			next_due = vars->count && ktime_to_ns(schedule) >= 0;
+		}
+	}
+
+	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
+	if (vars->p_drop)
+		drop |= (prandom_u32() < vars->p_drop); /* p_drop / 2^32 odds */
+
+	/* Overload the drop_next field as an activity timeout */
+	if (!vars->count)
+		vars->drop_next = ktime_add_ns(now, p->interval);
+	else if (ktime_to_ns(schedule) > 0 && !drop)
+		vars->drop_next = now; /* overdue yet delivered: resync */
+
+	return drop;
+}
+
+/* The flow modes are bit combinations, so testing for a dual mode with a
+ * mask also matches triple-isolate mode, which has those same bits set.
+ */
+
+static bool cake_dsrc(int flow_mode)
+{
+	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
+}
+
+static bool cake_ddst(int flow_mode)
+{ /* true when every CAKE_FLOW_DUAL_DST bit is set in flow_mode */
+	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
+}
+
+static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
+		     int flow_mode)
+{
+	u32 flow_hash = 0, srchost_hash, dsthost_hash;
+	u16 reduced_hash, srchost_idx, dsthost_idx;
+	struct flow_keys keys, host_keys;
+
+	if (unlikely(flow_mode == CAKE_FLOW_NONE))
+		return 0; /* every packet lands in queue 0 */
+
+	skb_flow_dissect_flow_keys(skb, &keys,
+				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+	/* flow_hash_from_keys() sorts the addresses by value, so we have
+	 * to preserve their order in a separate data structure to treat
+	 * src and dst host addresses as independently selectable.
+	 */
+	host_keys = keys;
+	host_keys.ports.ports     = 0;
+	host_keys.basic.ip_proto  = 0;
+	host_keys.keyid.keyid     = 0;
+	host_keys.tags.flow_label = 0;
+
+	switch (host_keys.control.addr_type) {
+	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		host_keys.addrs.v4addrs.src = 0;
+		dsthost_hash = flow_hash_from_keys(&host_keys);
+		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+		host_keys.addrs.v4addrs.dst = 0;
+		srchost_hash = flow_hash_from_keys(&host_keys);
+		break;
+
+	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+		memset(&host_keys.addrs.v6addrs.src, 0,
+		       sizeof(host_keys.addrs.v6addrs.src));
+		dsthost_hash = flow_hash_from_keys(&host_keys);
+		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+		memset(&host_keys.addrs.v6addrs.dst, 0,
+		       sizeof(host_keys.addrs.v6addrs.dst));
+		srchost_hash = flow_hash_from_keys(&host_keys);
+		break;
+
+	default: /* neither IPv4 nor IPv6: no host isolation */
+		dsthost_hash = 0;
+		srchost_hash = 0;
+	}
+
+	/* This *must* be after the above switch, since as a
+	 * side-effect it sorts the src and dst addresses.
+	 */
+	if (flow_mode & CAKE_FLOW_FLOWS)
+		flow_hash = flow_hash_from_keys(&keys);
+
+	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
+		if (flow_mode & CAKE_FLOW_SRC_IP)
+			flow_hash ^= srchost_hash;
+
+		if (flow_mode & CAKE_FLOW_DST_IP)
+			flow_hash ^= dsthost_hash;
+	}
+
+	reduced_hash = flow_hash % CAKE_QUEUES; /* direct-mapped slot */
+
+	/* set-associative hashing */
+	/* fast path if no hash collision (direct lookup succeeds) */
+	if (likely(q->tags[reduced_hash] == flow_hash &&
+		   q->flows[reduced_hash].set)) {
+		q->way_directs++;
+	} else {
+		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
+		u32 outer_hash = reduced_hash - inner_hash;
+		bool allocate_src = false;
+		bool allocate_dst = false;
+		u32 i, k;
+
+		/* check if any active queue in the set is reserved for
+		 * this flow.
+		 */
+		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+		     i++, k = (k + 1) % CAKE_SET_WAYS) {
+			if (q->tags[outer_hash + k] == flow_hash) {
+				if (i)
+					q->way_hits++;
+
+				if (!q->flows[outer_hash + k].set) {
+					/* need to increment host refcnts */
+					allocate_src = cake_dsrc(flow_mode);
+					allocate_dst = cake_ddst(flow_mode);
+				}
+
+				goto found;
+			}
+		}
+
+		/* no queue is reserved for this flow, look for an
+		 * empty one.
+		 */
+		for (i = 0; i < CAKE_SET_WAYS;
+			 i++, k = (k + 1) % CAKE_SET_WAYS) {
+			if (!q->flows[outer_hash + k].set) {
+				q->way_misses++;
+				allocate_src = cake_dsrc(flow_mode);
+				allocate_dst = cake_ddst(flow_mode);
+				goto found;
+			}
+		}
+
+		/* No empty queue: reuse the original, accept the collision.
+		 * NOTE(review): refcnts drop for any flow_mode - intended?
+		 */
+		q->way_collisions++;
+		q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
+		q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
+		allocate_src = cake_dsrc(flow_mode);
+		allocate_dst = cake_ddst(flow_mode);
+found:
+		/* reserve queue for future packets in same flow */
+		reduced_hash = outer_hash + k;
+		q->tags[reduced_hash] = flow_hash;
+
+		if (allocate_src) {
+			srchost_idx = srchost_hash % CAKE_QUEUES;
+			inner_hash = srchost_idx % CAKE_SET_WAYS;
+			outer_hash = srchost_idx - inner_hash;
+			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+				i++, k = (k + 1) % CAKE_SET_WAYS) {
+				if (q->hosts[outer_hash + k].srchost_tag ==
+				    srchost_hash)
+					goto found_src;
+			}
+			for (i = 0; i < CAKE_SET_WAYS;
+				i++, k = (k + 1) % CAKE_SET_WAYS) {
+				if (!q->hosts[outer_hash + k].srchost_refcnt)
+					break; /* unused host slot */
+			}
+			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
+found_src:
+			srchost_idx = outer_hash + k;
+			q->hosts[srchost_idx].srchost_refcnt++;
+			q->flows[reduced_hash].srchost = srchost_idx;
+		}
+
+		if (allocate_dst) {
+			dsthost_idx = dsthost_hash % CAKE_QUEUES;
+			inner_hash = dsthost_idx % CAKE_SET_WAYS;
+			outer_hash = dsthost_idx - inner_hash;
+			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+			     i++, k = (k + 1) % CAKE_SET_WAYS) {
+				if (q->hosts[outer_hash + k].dsthost_tag ==
+				    dsthost_hash)
+					goto found_dst;
+			}
+			for (i = 0; i < CAKE_SET_WAYS;
+			     i++, k = (k + 1) % CAKE_SET_WAYS) {
+				if (!q->hosts[outer_hash + k].dsthost_refcnt)
+					break; /* unused host slot */
+			}
+			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
+found_dst:
+			dsthost_idx = outer_hash + k;
+			q->hosts[dsthost_idx].dsthost_refcnt++;
+			q->flows[reduced_hash].dsthost = dsthost_idx;
+		}
+	}
+
+	return reduced_hash; /* index into q->flows[] */
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+/* remove one skb from head of slot queue */
+
+static struct sk_buff *dequeue_head(struct cake_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	if (!first) /* empty queue: nothing to unlink */
+		return NULL;
+
+	flow->head = first->next;
+	first->next = NULL;
+	return first;
+}
+
+/* add skb to flow queue (tail add) */
+
+static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
+{
+	if (flow->head) /* non-empty: link behind the current tail */
+		flow->tail->next = skb;
+	else
+		flow->head = skb;
+	flow->tail = skb;
+	skb->next = NULL;
+}
+
+static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
+{
+	/* drop 1/2^shift of the old average, add 1/2^shift of the sample */
+	avg = avg - (avg >> shift) + (sample >> shift);
+	return avg;
+}
+
+static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
+{
+	struct cake_heap_entry at_i = q->overflow_heap[i];
+	struct cake_heap_entry at_j = q->overflow_heap[j];
+
+	/* record each queue's new heap slot, then exchange the entries */
+	q->tins[at_i.t].overflow_idx[at_i.b] = j;
+	q->tins[at_j.t].overflow_idx[at_j.b] = i;
+	q->overflow_heap[j] = at_i;
+	q->overflow_heap[i] = at_j;
+}
+
+static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
+{
+	const struct cake_heap_entry *slot = &q->overflow_heap[i];
+
+	return q->tins[slot->t].backlogs[slot->b];
+}
+
+static void cake_heapify(struct cake_sched_data *q, u16 i)
+{
+	/* Sift the entry at heap index i down until neither child has a
+	 * larger backlog, keeping overflow_heap[] a max-heap on backlog.
+	 */
+	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
+	u32 mb = cake_heap_get_backlog(q, i);
+	u32 m = i;
+
+	while (m < a) {
+		u32 l = m + m + 1;
+		u32 r = l + 1;
+		u32 lb = l < a ? cake_heap_get_backlog(q, l) : 0;
+		u32 rb = r < a ? cake_heap_get_backlog(q, r) : 0;
+
+		/* pick the largest of the entry and its children */
+		if (lb > mb) {
+			m  = l;
+			mb = lb;
+		}
+		if (rb > mb) {
+			m  = r;
+			mb = rb;
+		}
+		if (m == i)
+			break;
+
+		cake_heap_swap(q, i, m);
+		i = m;
+		/* Re-read the key: the entry that sank keeps its own
+		 * backlog.  Leaving mb at the promoted child's value would
+		 * end the descent after a single level.
+		 */
+		mb = cake_heap_get_backlog(q, i);
+	}
+}
+
+static void cake_heapify_up(struct cake_sched_data *q, u16 i)
+{
+	/* Bubble the entry at heap index i towards the root for as long as
+	 * its backlog exceeds its parent's.
+	 */
+	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
+		u16 parent = (i - 1) / 2;
+		u32 own = cake_heap_get_backlog(q, i);
+
+		if (own <= cake_heap_get_backlog(q, parent))
+			break;
+		cake_heap_swap(q, i, parent);
+		i = parent;
+	}
+}
+
+static int cake_advance_shaper(struct cake_sched_data *q,
+			       struct cake_tin_data *b,
+			       struct sk_buff *skb,
+			       ktime_t now, bool drop)
+{
+	u32 len = qdisc_pkt_len(skb);
+	u64 tin_dur, global_dur;
+
+	/* unshaped: nothing to charge */
+	if (!q->rate_ns)
+		return len;
+
+	/* time this packet occupies at the tin rate and the global rate */
+	tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
+	global_dur = (len * q->rate_ns) >> q->rate_shft;
+
+	if (ktime_before(b->time_next_packet, now))
+		b->time_next_packet = ktime_add_ns(b->time_next_packet,
+						   tin_dur);
+	else if (ktime_before(b->time_next_packet,
+			      ktime_add_ns(now, tin_dur)))
+		b->time_next_packet = ktime_add_ns(now, tin_dur);
+
+	q->time_next_packet = ktime_add_ns(q->time_next_packet, global_dur);
+
+	/* the failsafe clock advances 1.5x as far and skips dropped packets */
+	if (!drop)
+		q->failsafe_next_packet =
+			ktime_add_ns(q->failsafe_next_packet,
+				     global_dur + (global_dur >> 1));
+	return len;
+}
+
+static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	ktime_t now = ktime_get();
+	struct cake_heap_entry top;
+	struct cake_tin_data *b;
+	struct cake_flow *flow;
+	struct sk_buff *victim;
+	u32 handle, len;
+
+	if (!q->overflow_timeout) {
+		int n = CAKE_MAX_TINS * CAKE_QUEUES / 2;
+
+		/* heap is stale: sift down from the middle to the root */
+		while (n >= 0)
+			cake_heapify(q, n--);
+	}
+	q->overflow_timeout = 65535;
+
+	/* the heap root is taken as the queue with the largest backlog */
+	top = q->overflow_heap[0];
+	handle = top.b + (top.t << 16);
+	b = &q->tins[top.t];
+	flow = &b->flows[top.b];
+
+	victim = dequeue_head(flow);
+	if (unlikely(!victim)) {
+		/* root queue was empty: have the next call rebuild the heap */
+		q->overflow_timeout = 0;
+		return handle;
+	}
+
+	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
+		b->unresponsive_flow_count++;
+
+	/* take the packet out of every byte and packet counter */
+	len = qdisc_pkt_len(victim);
+	q->buffer_used -= victim->truesize;
+	b->backlogs[top.b] -= len;
+	b->tin_backlog -= len;
+	sch->qstats.backlog -= len;
+	qdisc_tree_reduce_backlog(sch, 1, len);
+
+	flow->dropped++;
+	b->tin_dropped++;
+	sch->qstats.drops++;
+
+	__qdisc_drop(victim, to_free);
+	sch->q.qlen--;
+
+	cake_heapify(q, 0);
+	return handle;
+}
+
+static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t,
+			 struct sk_buff *skb, int flow_mode, int *qerr)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *filter;
+	struct tcf_result res;
+	int result;
+
+	filter = rcu_dereference_bh(q->filter_list);
+	if (!filter) /* no classifier attached: hash, *qerr untouched */
+		return cake_hash(t, skb, flow_mode) + 1;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	result = tcf_classify(skb, filter, &res, false);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+		case TC_ACT_TRAP:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+			/* fall through */
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
+			return TC_H_MIN(res.classid); /* queue index + 1 */
+	}
+	return 0; /* 0 makes cake_enqueue() drop the packet */
+}
+
+static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			struct sk_buff **to_free)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	int len = qdisc_pkt_len(skb);
+	int uninitialized_var(ret);
+	ktime_t now = ktime_get();
+	struct cake_tin_data *b;
+	struct cake_flow *flow;
+	u32 idx, tin;
+
+	tin = 0; /* this function always enqueues into tin 0 */
+	b = &q->tins[tin];
+
+	/* choose flow to insert into */
+	idx = cake_classify(sch, b, skb, q->flow_mode, &ret);
+	if (idx == 0) {
+		if (ret & __NET_XMIT_BYPASS)
+			qdisc_qstats_drop(sch);
+		__qdisc_drop(skb, to_free);
+		return ret;
+	}
+	idx--; /* cake_classify() returns the queue index plus one */
+	flow = &b->flows[idx];
+
+	/* ensure shaper state isn't stale */
+	if (!b->tin_backlog) {
+		if (ktime_before(b->time_next_packet, now))
+			b->time_next_packet = now;
+
+		if (!sch->q.qlen) {
+			if (ktime_before(q->time_next_packet, now)) {
+				q->failsafe_next_packet = now;
+				q->time_next_packet = now;
+			} else if (ktime_after(q->time_next_packet, now) &&
+				   ktime_after(q->failsafe_next_packet, now)) {
+				u64 next = \
+					min(ktime_to_ns(q->time_next_packet),
+					    ktime_to_ns(
+						   q->failsafe_next_packet));
+				sch->qstats.overlimits++;
+				qdisc_watchdog_schedule_ns(&q->watchdog, next);
+			}
+		}
+	}
+
+	if (unlikely(len > b->max_skblen))
+		b->max_skblen = len;
+
+	cobalt_set_enqueue_time(skb, now);
+	flow_queue_add(flow, skb);
+
+	sch->q.qlen++;
+	q->buffer_used      += skb->truesize;
+
+	/* stats */
+	b->packets++;
+	b->bytes	    += len;
+	b->backlogs[idx]    += len;
+	b->tin_backlog      += len;
+	sch->qstats.backlog += len;
+	q->avg_window_bytes += len;
+
+	if (q->overflow_timeout) /* heap is live: this queue just grew */
+		cake_heapify_up(q, b->overflow_idx[idx]);
+
+	/* incoming bandwidth capacity estimate */
+	q->avg_window_bytes = 0; /* NOTE(review): discards the += len above */
+	q->last_packet_time = now;
+
+	/* flowchain */
+	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
+		struct cake_host *srchost = &b->hosts[flow->srchost];
+		struct cake_host *dsthost = &b->hosts[flow->dsthost];
+		u16 host_load = 1;
+
+		if (!flow->set) {
+			list_add_tail(&flow->flowchain, &b->new_flows);
+		} else {
+			b->decaying_flow_count--;
+			list_move_tail(&flow->flowchain, &b->new_flows);
+		}
+		flow->set = CAKE_SET_SPARSE;
+		b->sparse_flow_count++;
+
+		if (cake_dsrc(q->flow_mode))
+			host_load = max(host_load, srchost->srchost_refcnt);
+
+		if (cake_ddst(q->flow_mode))
+			host_load = max(host_load, dsthost->dsthost_refcnt);
+
+		flow->deficit = (b->flow_quantum *
+				 quantum_div[host_load]) >> 16;
+	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
+		/* this flow was empty, accounted as a sparse flow, but actually
+		 * in the bulk rotation.
+		 */
+		flow->set = CAKE_SET_BULK;
+		b->sparse_flow_count--;
+		b->bulk_flow_count++;
+	}
+
+	if (q->buffer_used > q->buffer_max_used)
+		q->buffer_max_used = q->buffer_used;
+
+	if (q->buffer_used > q->buffer_limit) {
+		u32 dropped = 0;
+
+		while (q->buffer_used > q->buffer_limit) {
+			dropped++;
+			cake_drop(sch, to_free); /* prunes the heap's top queue */
+		}
+		b->drop_overlimit += dropped;
+	}
+	return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct cake_tin_data *b = &q->tins[q->cur_tin];
+	struct cake_flow *flow = &b->flows[q->cur_flow];
+	struct sk_buff *skb;
+	u32 len;
+
+	if (!flow->head) /* current flow has nothing queued */
+		return NULL;
+
+	skb = dequeue_head(flow);
+	len = qdisc_pkt_len(skb);
+	sch->q.qlen--;
+	q->buffer_used -= skb->truesize;
+	sch->qstats.backlog -= len;
+	b->tin_backlog -= len;
+	b->backlogs[q->cur_flow] -= len;
+	if (q->overflow_timeout) /* heap is live: this queue just shrank */
+		cake_heapify(q, b->overflow_idx[q->cur_flow]);
+	return skb;
+}
+
+/* Free every packet still queued in @tin, e.g. once the tin is unused. */
+static void cake_clear_tin(struct Qdisc *sch, u16 tin)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	q->cur_tin = tin; /* cake_dequeue_one() works on this cursor */
+	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
+		while ((skb = cake_dequeue_one(sch)) != NULL)
+			kfree_skb(skb);
+}
+
+static struct sk_buff *cake_dequeue(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct cake_tin_data *b = &q->tins[q->cur_tin];
+	struct cake_host *srchost, *dsthost;
+	ktime_t now = ktime_get();
+	struct cake_flow *flow;
+	struct list_head *head;
+	bool first_flow = true;
+	struct sk_buff *skb;
+	u16 host_load;
+	u64 delay;
+	u32 len;
+
+begin:
+	if (!sch->q.qlen)
+		return NULL;
+
+	/* global hard shaper */
+	if (ktime_after(q->time_next_packet, now) &&
+	    ktime_after(q->failsafe_next_packet, now)) {
+		u64 next = min(ktime_to_ns(q->time_next_packet),
+			       ktime_to_ns(q->failsafe_next_packet));
+
+		sch->qstats.overlimits++;
+		qdisc_watchdog_schedule_ns(&q->watchdog, next);
+		return NULL;
+	}
+
+	/* Choose a class to work on. */
+	if (!q->rate_ns) {
+		/* In unlimited mode, can't rely on shaper timings, just balance
+		 * with DRR
+		 */
+		bool wrapped = false, empty = true;
+
+		while (b->tin_deficit < 0 ||
+		       !(b->sparse_flow_count + b->bulk_flow_count)) {
+			if (b->tin_deficit <= 0)
+				b->tin_deficit += b->tin_quantum_band;
+			if (b->sparse_flow_count + b->bulk_flow_count)
+				empty = false;
+
+			q->cur_tin++;
+			b++;
+			if (q->cur_tin >= q->tin_cnt) {
+				q->cur_tin = 0;
+				b = q->tins;
+
+				if (wrapped) {
+					/* It's possible for q->qlen to be
+					 * nonzero when we actually have no
+					 * packets anywhere.
+					 */
+					if (empty)
+						return NULL;
+				} else {
+					wrapped = true;
+				}
+			}
+		}
+	} else {
+		/* In shaped mode, choose:
+		 * - Highest-priority tin with queue and meeting schedule, or
+		 * - The earliest-scheduled tin with queue.
+		 */
+		ktime_t best_time = KTIME_MAX;
+		int tin, best_tin = 0;
+
+		for (tin = 0; tin < q->tin_cnt; tin++) {
+			b = q->tins + tin;
+			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
+				ktime_t time_to_pkt = \
+					ktime_sub(b->time_next_packet, now);
+
+				if (ktime_to_ns(time_to_pkt) <= 0 ||
+				    ktime_compare(time_to_pkt,
+						  best_time) <= 0) {
+					best_time = time_to_pkt;
+					best_tin = tin;
+				}
+			}
+		}
+
+		q->cur_tin = best_tin;
+		b = q->tins + best_tin;
+
+		/* No point in going further if no packets to deliver. */
+		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
+			return NULL;
+	}
+
+retry:
+	/* service this class */
+	head = &b->decaying_flows;
+	if (!first_flow || list_empty(head)) {
+		head = &b->new_flows;
+		if (list_empty(head)) {
+			head = &b->old_flows;
+			if (unlikely(list_empty(head))) {
+				head = &b->decaying_flows;
+				if (unlikely(list_empty(head)))
+					goto begin;
+			}
+		}
+	}
+	flow = list_first_entry(head, struct cake_flow, flowchain);
+	q->cur_flow = flow - b->flows;
+	first_flow = false;
+
+	/* triple isolation (modified DRR++) */
+	srchost = &b->hosts[flow->srchost];
+	dsthost = &b->hosts[flow->dsthost];
+	host_load = 1;
+
+	if (cake_dsrc(q->flow_mode))
+		host_load = max(host_load, srchost->srchost_refcnt);
+
+	if (cake_ddst(q->flow_mode))
+		host_load = max(host_load, dsthost->dsthost_refcnt);
+
+	WARN_ON(host_load > CAKE_QUEUES);
+
+	/* flow isolation (DRR++) */
+	if (flow->deficit <= 0) {
+		/* The shifted prandom_u32() is a way to apply dithering to
+		 * avoid accumulating roundoff errors
+		 */
+		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
+				  (prandom_u32() >> 16)) >> 16;
+		list_move_tail(&flow->flowchain, &b->old_flows);
+
+		/* Keep all flows with deficits out of the sparse and decaying
+		 * rotations.  No non-empty flow can go into the decaying
+		 * rotation, so they can't get deficits
+		 */
+		if (flow->set == CAKE_SET_SPARSE) {
+			if (flow->head) {
+				b->sparse_flow_count--;
+				b->bulk_flow_count++;
+				flow->set = CAKE_SET_BULK;
+			} else {
+				/* we've moved it to the bulk rotation for
+				 * correct deficit accounting but we still want
+				 * to count it as a sparse flow, not a bulk one.
+				 */
+				flow->set = CAKE_SET_SPARSE_WAIT;
+			}
+		}
+		goto retry;
+	}
+
+	/* Retrieve a packet via the AQM */
+	while (1) {
+		skb = cake_dequeue_one(sch);
+		if (!skb) {
+			/* this queue was actually empty */
+			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
+				b->unresponsive_flow_count--;
+
+			if (flow->cvars.p_drop || flow->cvars.count ||
+			    ktime_before(now, flow->cvars.drop_next)) {
+				/* keep in the flowchain until the state has
+				 * decayed to rest
+				 */
+				list_move_tail(&flow->flowchain,
+					       &b->decaying_flows);
+				if (flow->set == CAKE_SET_BULK) {
+					b->bulk_flow_count--;
+					b->decaying_flow_count++;
+				} else if (flow->set == CAKE_SET_SPARSE ||
+					   flow->set == CAKE_SET_SPARSE_WAIT) {
+					b->sparse_flow_count--;
+					b->decaying_flow_count++;
+				}
+				flow->set = CAKE_SET_DECAYING;
+			} else {
+				/* remove empty queue from the flowchain */
+				list_del_init(&flow->flowchain);
+				if (flow->set == CAKE_SET_SPARSE ||
+				    flow->set == CAKE_SET_SPARSE_WAIT)
+					b->sparse_flow_count--;
+				else if (flow->set == CAKE_SET_BULK)
+					b->bulk_flow_count--;
+				else
+					b->decaying_flow_count--;
+
+				flow->set = CAKE_SET_NONE;
+				srchost->srchost_refcnt--;
+				dsthost->dsthost_refcnt--;
+			}
+			goto begin;
+		}
+
+		/* Last packet in queue may be marked, shouldn't be dropped */
+		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb) ||
+		    !flow->head)
+			break;
+
+		flow->dropped++;
+		b->tin_dropped++;
+		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
+		qdisc_qstats_drop(sch);
+		kfree_skb(skb);
+	}
+
+	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
+	qdisc_bstats_update(sch, skb);
+
+	/* collect delay stats */
+	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
+	b->peak_delay = cake_ewma(b->peak_delay, delay,
+				  delay > b->peak_delay ? 2 : 8);
+	b->base_delay = cake_ewma(b->base_delay, delay,
+				  delay < b->base_delay ? 2 : 8);
+
+	len = cake_advance_shaper(q, b, skb, now, false);
+	flow->deficit -= len;
+	b->tin_deficit -= len;
+
+	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
+		u64 next = min(ktime_to_ns(q->time_next_packet),
+			       ktime_to_ns(q->failsafe_next_packet));
+
+		qdisc_watchdog_schedule_ns(&q->watchdog, next);
+	} else if (!sch->q.qlen) {
+		int i;
+
+		for (i = 0; i < q->tin_cnt; i++) {
+			if (q->tins[i].decaying_flow_count) {
+				ktime_t next = \
+					ktime_add_ns(now,
+						     q->tins[i].cparams.target);
+
+				qdisc_watchdog_schedule_ns(&q->watchdog,
+							   ktime_to_ns(next));
+				break;
+			}
+		}
+	}
+
+	if (q->overflow_timeout)
+		q->overflow_timeout--;
+
+	return skb;
+}
+
+static void cake_reset(struct Qdisc *sch)
+{
+	u32 c;
+
+	for (c = 0; c < CAKE_MAX_TINS; c++)
+		cake_clear_tin(sch, c);
+}
+
+static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
+	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
+	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
+	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
+	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
+	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
+	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
+	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
+	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
+	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
+	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
+	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
+	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
+	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
+	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
+	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
+};
+
+static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+			  u64 target_ns, u64 rtt_est_ns)
+{
+	/* convert byte-rate into time-per-byte; rates below MIN_RATE are
+	 * clamped to it so it will always unwedge in reasonable time.
+	 */
+	static const u64 MIN_RATE = 64;
+	u32 byte_target = mtu;
+	u64 byte_target_ns;
+	u8  rate_shft = 0;
+	u64 rate_ns = 0;
+
+	b->flow_quantum = 1514;
+	if (rate) {
+		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
+		rate_shft = 34;
+		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
+		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
+		while (!!(rate_ns >> 34)) {
+			rate_ns >>= 1;
+			rate_shft--;
+		}
+	} /* else unlimited, ie. zero delay */
+
+	b->tin_rate_bps  = rate;
+	b->tin_rate_ns   = rate_ns;
+	b->tin_rate_shft = rate_shft;
+
+	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
+
+	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
+	b->cparams.interval = max(rtt_est_ns +
+				     b->cparams.target - target_ns,
+				     b->cparams.target * 2);
+	b->cparams.mtu_time = byte_target_ns;
+	b->cparams.p_inc = 1 << 24; /* 1/256 */
+	b->cparams.p_dec = 1 << 20; /* 1/4096 */
+}
+
+static void cake_reconfigure(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct cake_tin_data *b = &q->tins[0];
+	int c, ft = 0;
+
+	q->tin_cnt = 1;
+	cake_set_rate(b, q->rate_bps, psched_mtu(qdisc_dev(sch)),
+		      us_to_ns(q->target), us_to_ns(q->interval));
+	b->tin_quantum_band = 65535;
+	b->tin_quantum_prio = 65535;
+
+	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
+		cake_clear_tin(sch, c);
+		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
+	}
+
+	q->rate_ns   = q->tins[ft].tin_rate_ns;
+	q->rate_shft = q->tins[ft].tin_rate_shft;
+
+	if (q->buffer_config_limit) {
+		q->buffer_limit = q->buffer_config_limit;
+	} else if (q->rate_bps) {
+		u64 t = q->rate_bps * q->interval;
+
+		do_div(t, USEC_PER_SEC / 4);
+		q->buffer_limit = max_t(u32, t, 4U << 20);
+	} else {
+		q->buffer_limit = ~0;
+	}
+
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	q->buffer_limit = min(q->buffer_limit,
+			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
+				  q->buffer_config_limit));
+}
+
+static int cake_change(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CAKE_MAX + 1];
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CAKE_BASE_RATE64])
+		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
+
+	if (tb[TCA_CAKE_FLOW_MODE])
+		q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
+				CAKE_FLOW_MASK);
+
+	if (tb[TCA_CAKE_RTT]) {
+		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
+
+		if (!q->interval)
+			q->interval = 1;
+	}
+
+	if (tb[TCA_CAKE_TARGET]) {
+		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
+
+		if (!q->target)
+			q->target = 1;
+	}
+
+	if (tb[TCA_CAKE_MEMORY])
+		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+
+	if (q->tins) {
+		sch_tree_lock(sch);
+		cake_reconfigure(sch);
+		sch_tree_unlock(sch);
+	}
+
+	return 0;
+}
+
+static void cake_destroy(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+
+	qdisc_watchdog_cancel(&q->watchdog);
+	tcf_block_put(q->block);
+	kvfree(q->tins);
+}
+
+/* Qdisc ->init(): set defaults, apply @opt, allocate and set up the tins. */
+static int cake_init(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	int i, j, err;
+
+	sch->limit = 10240;
+	q->tin_mode = CAKE_DIFFSERV_BESTEFFORT;
+	q->flow_mode  = CAKE_FLOW_TRIPLE;
+
+	q->rate_bps = 0; /* unlimited by default */
+
+	q->interval = 100000; /* 100ms default */
+	q->target   =   5000; /* 5ms: codel RFC argues
+			       * for 5 to 10% of interval
+			       */
+
+	q->cur_tin = 0;
+	q->cur_flow  = 0;
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	if (opt) {
+		err = cake_change(sch, opt, extack);
+		if (err)
+			return err;
+	}
+
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+	if (err)
+		return err;
+
+	quantum_div[0] = ~0;
+	for (i = 1; i <= CAKE_QUEUES; i++)
+		quantum_div[i] = 65535 / i;
+
+	q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
+			   GFP_KERNEL);
+	if (!q->tins) {
+		/* The qdisc core calls ->destroy() when ->init() fails, so
+		 * calling cake_destroy() here would put q->block twice.
+		 */
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < CAKE_MAX_TINS; i++) {
+		struct cake_tin_data *b = q->tins + i;
+
+		INIT_LIST_HEAD(&b->new_flows);
+		INIT_LIST_HEAD(&b->old_flows);
+		INIT_LIST_HEAD(&b->decaying_flows);
+		b->sparse_flow_count = 0;
+		b->bulk_flow_count = 0;
+		b->decaying_flow_count = 0;
+
+		for (j = 0; j < CAKE_QUEUES; j++) {
+			struct cake_flow *flow = b->flows + j;
+			u32 k = j * CAKE_MAX_TINS + i;
+
+			INIT_LIST_HEAD(&flow->flowchain);
+			cobalt_vars_init(&flow->cvars);
+
+			q->overflow_heap[k].t = i;
+			q->overflow_heap[k].b = j;
+			b->overflow_idx[j] = k;
+		}
+	}
+
+	cake_reconfigure(sch);
+	q->avg_peak_bandwidth = q->rate_bps;
+	q->min_netlen = ~0;
+	q->min_adjlen = ~0;
+	return 0;
+}
+
+static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (!opts)
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
+			      TCA_CAKE_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
+			q->flow_mode & CAKE_FLOW_MASK))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	return -1;
+}
+
+static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tstats, *ts;
+	int i;
+
+	if (!stats)
+		return -1;
+
+#define PUT_STAT_U32(attr, data) do {				       \
+		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+			goto nla_put_failure;			       \
+	} while (0)
+#define PUT_STAT_U64(attr, data) do {				       \
+		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
+					data, TCA_CAKE_STATS_PAD)) \
+			goto nla_put_failure;			       \
+	} while (0)
+
+	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
+	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
+	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
+	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
+	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
+	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
+	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
+	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+
+#undef PUT_STAT_U32
+#undef PUT_STAT_U64
+
+	tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
+	if (!tstats)
+		goto nla_put_failure;
+
+#define PUT_TSTAT_U32(attr, data) do {					\
+		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
+			goto nla_put_failure;				\
+	} while (0)
+#define PUT_TSTAT_U64(attr, data) do {					\
+		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
+					data, TCA_CAKE_TIN_STATS_PAD))	\
+			goto nla_put_failure;				\
+	} while (0)
+
+	for (i = 0; i < q->tin_cnt; i++) {
+		struct cake_tin_data *b = &q->tins[i];
+
+		ts = nla_nest_start(d->skb, i + 1);
+		if (!ts)
+			goto nla_put_failure;
+
+		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
+		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
+		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
+
+		PUT_TSTAT_U32(TARGET_US,
+			      ktime_to_us(ns_to_ktime(b->cparams.target)));
+		PUT_TSTAT_U32(INTERVAL_US,
+			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
+
+		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
+		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
+		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
+		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
+
+		PUT_TSTAT_U32(PEAK_DELAY_US,
+			      ktime_to_us(ns_to_ktime(b->peak_delay)));
+		PUT_TSTAT_U32(AVG_DELAY_US,
+			      ktime_to_us(ns_to_ktime(b->avge_delay)));
+		PUT_TSTAT_U32(BASE_DELAY_US,
+			      ktime_to_us(ns_to_ktime(b->base_delay)));
+
+		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
+		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
+		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
+
+		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
+					    b->decaying_flow_count);
+		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
+		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
+		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
+
+		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
+		nla_nest_end(d->skb, ts);
+	}
+
+#undef PUT_TSTAT_U32
+#undef PUT_TSTAT_U64
+
+	nla_nest_end(d->skb, tstats);
+	return nla_nest_end(d->skb, stats);
+
+nla_put_failure:
+	nla_nest_cancel(d->skb, stats);
+	return -1;
+}
+
+static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long cake_find(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
+			       u32 classid)
+{
+	return 0;
+}
+
+static void cake_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
+					struct netlink_ext_ack *extack)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return q->block;
+}
+
+static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
+			   struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				 struct gnet_dump *d)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	const struct cake_flow *flow = NULL;
+	struct gnet_stats_queue qs = { 0 };
+	struct nlattr *stats;
+	u32 idx = cl - 1;
+
+	if (idx < CAKE_QUEUES * q->tin_cnt) {
+		const struct cake_tin_data *b = &q->tins[idx / CAKE_QUEUES];
+		const struct sk_buff *skb;
+
+		flow = &b->flows[idx % CAKE_QUEUES];
+
+		if (flow->head) {
+			sch_tree_lock(sch);
+			skb = flow->head;
+			while (skb) {
+				qs.qlen++;
+				skb = skb->next;
+			}
+			sch_tree_unlock(sch);
+		}
+		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
+		qs.drops = flow->dropped;
+	}
+	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+		return -1;
+	if (flow) {
+		ktime_t now = ktime_get();
+
+		stats = nla_nest_start(d->skb, TCA_STATS_APP);
+		if (!stats)
+			return -1;
+
+#define PUT_STAT_U32(attr, data) do {				       \
+		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+			goto nla_put_failure;			       \
+	} while (0)
+#define PUT_STAT_S32(attr, data) do {				       \
+		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+			goto nla_put_failure;			       \
+	} while (0)
+
+		PUT_STAT_S32(DEFICIT, flow->deficit);
+		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
+		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
+		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
+		if (flow->cvars.p_drop) {
+			PUT_STAT_S32(BLUE_TIMER_US,
+				     ktime_to_us(
+					     ktime_sub(now,
+						     flow->cvars.blue_timer)));
+		}
+		if (flow->cvars.dropping) {
+			PUT_STAT_S32(DROP_NEXT_US,
+				     ktime_to_us(
+					     ktime_sub(now,
+						       flow->cvars.drop_next)));
+		}
+
+		if (nla_nest_end(d->skb, stats) < 0)
+			return -1;
+	}
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(d->skb, stats);
+	return -1;
+}
+
+static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	unsigned int i, j;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->tin_cnt; i++) {
+		struct cake_tin_data *b = &q->tins[i];
+
+		for (j = 0; j < CAKE_QUEUES; j++) {
+			if (list_empty(&b->flows[j].flowchain) ||
+			    arg->count < arg->skip) {
+				arg->count++;
+				continue;
+			}
+			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
+				arg->stop = 1;
+				break;
+			}
+			arg->count++;
+		}
+	}
+}
+
+static const struct Qdisc_class_ops cake_class_ops = {
+	.leaf		=	cake_leaf,
+	.find		=	cake_find,
+	.tcf_block	=	cake_tcf_block,
+	.bind_tcf	=	cake_bind,
+	.unbind_tcf	=	cake_unbind,
+	.dump		=	cake_dump_class,
+	.dump_stats	=	cake_dump_class_stats,
+	.walk		=	cake_walk,
+};
+
+static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
+	.cl_ops		=	&cake_class_ops,
+	.id		=	"cake",
+	.priv_size	=	sizeof(struct cake_sched_data),
+	.enqueue	=	cake_enqueue,
+	.dequeue	=	cake_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	cake_init,
+	.reset		=	cake_reset,
+	.destroy	=	cake_destroy,
+	.change		=	cake_change,
+	.dump		=	cake_dump,
+	.dump_stats	=	cake_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init cake_module_init(void)
+{
+	return register_qdisc(&cake_qdisc_ops);
+}
+
+static void __exit cake_module_exit(void)
+{
+	unregister_qdisc(&cake_qdisc_ops);
+}
+
+module_init(cake_module_init)
+module_exit(cake_module_exit)
+MODULE_AUTHOR("Jonathan Morton");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("The CAKE shaper.");
-- 
cgit v1.2.3


From 2640c3facbd6e21e63c95f19588cc24913a263cd Mon Sep 17 00:00:00 2001
From: shaoyunl <Shaoyun.Liu@amd.com>
Date: Wed, 11 Jul 2018 22:32:50 -0400
Subject: drm/amdkfd: Handle VM faults in KFD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address
   per-vmid. amdkfd needs to get the information from amdgpu through the
   new get_vm_fault_info interface. On GFX9 and later, all the required
   information is in the IH ring.
2. amdkfd unmaps all queues from the faulting process and creates a new
   run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY

Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c   | 25 ++++++++++++---
 drivers/gpu/drm/amd/amdkfd/cik_int.h               |  2 ++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 17 ++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c            | 37 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c    | 18 +++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  4 +++
 include/uapi/linux/kfd_ioctl.h                     |  2 +-
 7 files changed, 98 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 49df6c791cfc..cc33870e7edb 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 	return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
 		ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
 		ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
-		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
+		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+		ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
 }
 
 static void cik_event_interrupt_wq(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry)
 {
-	unsigned int pasid;
 	const struct cik_ih_ring_entry *ihre =
 			(const struct cik_ih_ring_entry *)ih_ring_entry;
 	uint32_t context_id = ihre->data & 0xfffffff;
-
-	pasid = (ihre->ring_id & 0xffff0000) >> 16;
+	unsigned int vmid  = (ihre->ring_id & 0x0000ff00) >> 8;
+	unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
 
 	if (pasid == 0)
 		return;
@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 		kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
 	else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
 		kfd_signal_hw_exception_event(pasid);
+	else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+		struct kfd_vm_fault_info info;
+
+		kfd_process_vm_fault(dev->dqm, pasid);
+
+		memset(&info, 0, sizeof(info));
+		dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+		if (!info.page_addr && !info.status)
+			return;
+
+		if (info.vmid == vmid)
+			kfd_signal_vm_fault_event(dev, pasid, &info);
+		else
+			kfd_signal_vm_fault_event(dev, pasid, NULL);
+	}
 }
 
 const struct kfd_event_interrupt_class event_interrupt_class_cik = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
index 109298b9d507..a2079a04a673 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
@@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
 #define CIK_INTSRC_DEQUEUE_COMPLETE	0xC6
 #define CIK_INTSRC_SDMA_TRAP		0xE0
 #define CIK_INTSRC_SQ_INTERRUPT_MSG	0xEF
+#define CIK_INTSRC_GFX_PAGE_INV_FAULT	0x92
+#define CIK_INTSRC_GFX_MEM_PROT_FAULT	0x93
 
 #endif
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f2f81d26db0c..44fc2038770e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 	kfree(dqm);
 }
 
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+			 unsigned int pasid)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	int ret = 0;
+
+	if (!p)
+		return -EINVAL;
+	pdd = kfd_get_process_device_data(dqm->dev, p);
+	if (pdd)
+		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+	kfd_unref_process(p);
+
+	return ret;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 3d5a8332e8c0..b58a0e665ebc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
 	mutex_unlock(&p->event_mutex);
 	kfd_unref_process(p);
 }
+
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+				struct kfd_vm_fault_info *info)
+{
+	struct kfd_event *ev;
+	uint32_t id;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct kfd_hsa_memory_exception_data memory_exception_data;
+
+	if (!p)
+		return; /* Presumably process exited. */
+	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.failure.imprecise = 1;
+	/* Set failure reason */
+	if (info) {
+		memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
+		memory_exception_data.failure.NotPresent =
+			info->prot_valid ? 1 : 0;
+		memory_exception_data.failure.NoExecute =
+			info->prot_exec ? 1 : 0;
+		memory_exception_data.failure.ReadOnly =
+			info->prot_write ? 1 : 0;
+		memory_exception_data.failure.imprecise = 0;
+	}
+	mutex_lock(&p->event_mutex);
+
+	id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+	idr_for_each_entry_continue(&p->event_idr, ev, id)
+		if (ev->type == KFD_EVENT_TYPE_MEMORY) {
+			ev->memory_exception_data = memory_exception_data;
+			set_event(ev);
+		}
+
+	mutex_unlock(&p->event_mutex);
+	kfd_unref_process(p);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 37029baa3346..d6b64e692760 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 	return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
 		source_id == SOC15_INTSRC_SDMA_TRAP ||
 		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
-		source_id == SOC15_INTSRC_CP_BAD_OPCODE;
+		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+		client_id == SOC15_IH_CLIENTID_VMC ||
+		client_id == SOC15_IH_CLIENTID_UTCL2;
 }
 
 static void event_interrupt_wq_v9(struct kfd_dev *dev,
@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		kfd_signal_hw_exception_event(pasid);
 	else if (client_id == SOC15_IH_CLIENTID_VMC ||
 		 client_id == SOC15_IH_CLIENTID_UTCL2) {
-		/* TODO */
+		struct kfd_vm_fault_info info = {0};
+		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+		info.vmid = vmid;
+		info.mc_id = client_id;
+		info.page_addr = ih_ring_entry[4] |
+			(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+		info.prot_valid = ring_id & 0x08;
+		info.prot_read  = ring_id & 0x10;
+		info.prot_write = ring_id & 0x20;
+
+		kfd_process_vm_fault(dev->dqm, pasid);
+		kfd_signal_vm_fault_event(dev, pasid, &info);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 5e3990bb4c4b..91a3368421b1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 					enum kfd_queue_type type);
 void kernel_queue_uninit(struct kernel_queue *kq);
+int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
 
 /* Process Queue Manager */
 struct process_queue_node {
@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
 		     uint64_t *event_page_offset, uint32_t *event_slot_index);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+				struct kfd_vm_fault_info *info);
+
 void kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index b4f5073dbac2..46a54ab1e728 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
 	__u32 NotPresent;	/* Page not present or supervisor privilege */
 	__u32 ReadOnly;	/* Write access to a read-only page */
 	__u32 NoExecute;	/* Execute access to a page marked NX */
-	__u32 pad;
+	__u32 imprecise;	/* Can't determine the	exact fault address */
 };
 
 /* memory exception data*/
-- 
cgit v1.2.3


From 0c119abad7f0d7987c3ce4ea76b30bde76d0436e Mon Sep 17 00:00:00 2001
From: Shaoyun Liu <Shaoyun.Liu@amd.com>
Date: Wed, 11 Jul 2018 22:32:53 -0400
Subject: drm/amd: Add kfd ioctl defines for hw_exception event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 include/uapi/linux/kfd_ioctl.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 46a54ab1e728..88d17c39dbf9 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -189,6 +189,15 @@ struct kfd_ioctl_dbg_wave_control_args {
 
 #define KFD_SIGNAL_EVENT_LIMIT			4096
 
+/* For kfd_event_data.hw_exception_data.reset_type. */
+#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET	0
+#define KFD_HW_EXCEPTION_PER_ENGINE_RESET	1
+
+/* For kfd_event_data.hw_exception_data.reset_cause. */
+#define KFD_HW_EXCEPTION_GPU_HANG	0
+#define KFD_HW_EXCEPTION_ECC		1
+
+
 struct kfd_ioctl_create_event_args {
 	__u64 event_page_offset;	/* from KFD */
 	__u32 event_trigger_data;	/* from KFD - signal events only */
@@ -230,10 +239,19 @@ struct kfd_hsa_memory_exception_data {
 	__u32 pad;
 };
 
-/* Event data*/
+/* hw exception data */
+struct kfd_hsa_hw_exception_data {
+	__u32 reset_type;	/* KFD_HW_EXCEPTION_*_RESET */
+	__u32 reset_cause;	/* KFD_HW_EXCEPTION_GPU_HANG or _ECC */
+	__u32 memory_lost;
+	__u32 gpu_id;
+};
+
+/* Event data */
 struct kfd_event_data {
 	union {
 		struct kfd_hsa_memory_exception_data memory_exception_data;
+		struct kfd_hsa_hw_exception_data hw_exception_data;
 	};				/* From KFD */
 	__u64 kfd_event_data_ext;	/* pointer to an extension structure
 					   for future exception types */
-- 
cgit v1.2.3


From 2bae79d2d38f3dc50bfef81d3b4f7328b2883a17 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 12 Jul 2018 12:52:22 +0100
Subject: bpf: fix documentation for eBPF helpers

Minor formatting edits for eBPF helpers documentation, including blank
lines removal, fix of item list for return values in bpf_fib_lookup(),
and missing prefix on bpf_skb_load_bytes_relative().

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b7db3261c62d..6bcb287a888d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1826,7 +1826,7 @@ union bpf_attr {
  * 		A non-negative value equal to or less than *size* on success,
  * 		or a negative error in case of failure.
  *
- * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
  * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
  * 		it provides an easy way to load *len* bytes from *offset*
@@ -1877,7 +1877,7 @@ union bpf_attr {
  *		* < 0 if any input argument is invalid
  *		*   0 on success (packet is forwarded, nexthop neighbor exists)
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
- *		*     packet is not forwarded or needs assist from full stack
+ *		  packet is not forwarded or needs assist from full stack
  *
  * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
@@ -2033,7 +2033,6 @@ union bpf_attr {
  *		This helper is only available is the kernel was compiled with
  *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
  *		"**y**".
- *
  *	Return
  *		0
  *
@@ -2053,7 +2052,6 @@ union bpf_attr {
  *		This helper is only available is the kernel was compiled with
  *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
  *		"**y**".
- *
  *	Return
  *		0
  *
-- 
cgit v1.2.3


From d8db7ea55f2ff5890ad31137233a3808d80c7f62 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:11 +0300
Subject: devlink: Add support for region get command

Add support for DEVLINK_CMD_REGION_GET command which is used for
querying for the supported DEV/REGION values of devlink devices.
The support is both for doit and dumpit.

Reply includes:
  BUS_NAME, DEVICE_NAME, REGION_NAME, REGION_SIZE

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |   6 +++
 net/core/devlink.c           | 114 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 68641fb56654..28bfa8aa3d91 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -83,6 +83,9 @@ enum devlink_command {
 	DEVLINK_CMD_PARAM_NEW,
 	DEVLINK_CMD_PARAM_DEL,
 
+	DEVLINK_CMD_REGION_GET,
+	DEVLINK_CMD_REGION_SET,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
@@ -262,6 +265,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_PARAM_VALUE_DATA,		/* dynamic */
 	DEVLINK_ATTR_PARAM_VALUE_CMODE,		/* u8 */
 
+	DEVLINK_ATTR_REGION_NAME,               /* string */
+	DEVLINK_ATTR_REGION_SIZE,               /* u64 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7d09fe60fa4b..221ddb6bae48 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3149,6 +3149,111 @@ static void devlink_param_unregister_one(struct devlink *devlink,
 	kfree(param_item);
 }
 
+static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
+				  enum devlink_command cmd, u32 portid,
+				  u32 seq, int flags,
+				  struct devlink_region *region)
+{
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = devlink_nl_put_handle(msg, devlink);
+	if (err)
+		goto nla_put_failure;
+
+	err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+	if (err)
+		goto nla_put_failure;
+
+	err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+				region->size,
+				DEVLINK_ATTR_PAD);
+	if (err)
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_region *region;
+	const char *region_name;
+	struct sk_buff *msg;
+	int err;
+
+	if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
+		return -EINVAL;
+
+	region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+	region = devlink_region_get_by_name(devlink, region_name);
+	if (!region)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET,
+				     info->snd_portid, info->snd_seq, 0,
+				     region);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
+					    struct netlink_callback *cb)
+{
+	struct devlink_region *region;
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(region, &devlink->region_list, list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_region_fill(msg, devlink,
+						     DEVLINK_CMD_REGION_GET,
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq,
+						     NLM_F_MULTI, region);
+			if (err) {
+				mutex_unlock(&devlink->lock);
+				goto out;
+			}
+			idx++;
+		}
+		mutex_unlock(&devlink->lock);
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+	cb->args[0] = idx;
+	return msg->len;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -3172,6 +3277,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -3370,6 +3476,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_REGION_GET,
+		.doit = devlink_nl_cmd_region_get_doit,
+		.dumpit = devlink_nl_cmd_region_get_dumpit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3


From a006d467fbf1d405e73cd167829d7a9e3df600e3 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:12 +0300
Subject: devlink: Extend the support querying for region snapshot IDs

Extend the support for the DEVLINK_CMD_REGION_GET command to also
return the IDs of the snapshots currently present on the region.
Each reply will include a nested snapshots attribute that
can contain multiple snapshot attributes, each with an ID.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  3 +++
 net/core/devlink.c           | 53 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 28bfa8aa3d91..abde4e306375 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -267,6 +267,9 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_REGION_NAME,               /* string */
 	DEVLINK_ATTR_REGION_SIZE,               /* u64 */
+	DEVLINK_ATTR_REGION_SNAPSHOTS,          /* nested */
+	DEVLINK_ATTR_REGION_SNAPSHOT,           /* nested */
+	DEVLINK_ATTR_REGION_SNAPSHOT_ID,        /* u32 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 221ddb6bae48..cb75e26d70ff 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3149,6 +3149,55 @@ static void devlink_param_unregister_one(struct devlink *devlink,
 	kfree(param_item);
 }
 
+static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
+					     struct devlink *devlink,
+					     struct devlink_snapshot *snapshot)
+{
+	struct nlattr *snap_attr;
+	int err;
+
+	snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
+	if (!snap_attr)
+		return -EINVAL;
+
+	err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
+	if (err)
+		goto nla_put_failure;
+
+	nla_nest_end(msg, snap_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, snap_attr);
+	return err;
+}
+
+static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
+					      struct devlink *devlink,
+					      struct devlink_region *region)
+{
+	struct devlink_snapshot *snapshot;
+	struct nlattr *snapshots_attr;
+	int err;
+
+	snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
+	if (!snapshots_attr)
+		return -EINVAL;
+
+	list_for_each_entry(snapshot, &region->snapshot_list, list) {
+		err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
+		if (err)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(msg, snapshots_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, snapshots_attr);
+	return err;
+}
+
 static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
 				  enum devlink_command cmd, u32 portid,
 				  u32 seq, int flags,
@@ -3175,6 +3224,10 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
 	if (err)
 		goto nla_put_failure;
 
+	err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
+	if (err)
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
-- 
cgit v1.2.3


From 866319bb9437614407ca36f8b16f89ab77a6a831 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:13 +0300
Subject: devlink: Add support for region snapshot delete command

Add support for DEVLINK_CMD_REGION_DEL, used
for deleting a snapshot from a region. The snapshot ID is required.
Also add notification support for NEW and DEL of snapshots.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |  2 +
 net/core/devlink.c           | 93 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index abde4e306375..d212e02f843f 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -85,6 +85,8 @@ enum devlink_command {
 
 	DEVLINK_CMD_REGION_GET,
 	DEVLINK_CMD_REGION_SET,
+	DEVLINK_CMD_REGION_NEW,
+	DEVLINK_CMD_REGION_DEL,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index cb75e26d70ff..fc0836371a71 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3236,6 +3236,58 @@ nla_put_failure:
 	return err;
 }
 
+static void devlink_nl_region_notify(struct devlink_region *region,
+				     struct devlink_snapshot *snapshot,
+				     enum devlink_command cmd)
+{
+	struct devlink *devlink = region->devlink;
+	struct sk_buff *msg;
+	void *hdr;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+	if (!hdr)
+		goto out_free_msg;
+
+	err = devlink_nl_put_handle(msg, devlink);
+	if (err)
+		goto out_cancel_msg;
+
+	err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
+			     region->name);
+	if (err)
+		goto out_cancel_msg;
+
+	if (snapshot) {
+		err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+				  snapshot->id);
+		if (err)
+			goto out_cancel_msg;
+	} else {
+		err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+					region->size, DEVLINK_ATTR_PAD);
+		if (err)
+			goto out_cancel_msg;
+	}
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+
+	return;
+
+out_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+out_free_msg:
+	nlmsg_free(msg);
+}
+
 static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
 					  struct genl_info *info)
 {
@@ -3307,6 +3359,35 @@ out:
 	return msg->len;
 }
 
+static int devlink_nl_cmd_region_del(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_snapshot *snapshot;
+	struct devlink_region *region;
+	const char *region_name;
+	u32 snapshot_id;
+
+	if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
+	    !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+		return -EINVAL;
+
+	region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+	snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+	region = devlink_region_get_by_name(devlink, region_name);
+	if (!region)
+		return -EINVAL;
+
+	snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+	if (!snapshot)
+		return -EINVAL;
+
+	devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
+	devlink_region_snapshot_del(snapshot);
+	return 0;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -3331,6 +3412,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -3537,6 +3619,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_REGION_DEL,
+		.doit = devlink_nl_cmd_region_del,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -4363,6 +4452,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink,
 	region->size = region_size;
 	INIT_LIST_HEAD(&region->snapshot_list);
 	list_add_tail(&region->list, &devlink->region_list);
+	devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
 
 	mutex_unlock(&devlink->lock);
 	return region;
@@ -4390,6 +4480,8 @@ void devlink_region_destroy(struct devlink_region *region)
 		devlink_region_snapshot_del(snapshot);
 
 	list_del(&region->list);
+
+	devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
 	mutex_unlock(&devlink->lock);
 	kfree(region);
 }
@@ -4467,6 +4559,7 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
 
 	region->cur_snapshots++;
 
+	devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
 	mutex_unlock(&devlink->lock);
 	return 0;
 
-- 
cgit v1.2.3


From 4e54795a27f56102649f121a34b8445e42f79ccd Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:14 +0300
Subject: devlink: Add support for region snapshot read command

Add support for DEVLINK_CMD_REGION_READ, used for both reading
and dumping region data. Read allows reading from a region-specific
address for a given length. Dump allows reading the full region.
If only a snapshot ID is provided, a snapshot dump will be done.
If a snapshot ID, address and length are provided, a snapshot read
will be done.

This is used for snapshot access and will be used in the same
way to access current data on the region.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/devlink.h |   7 ++
 net/core/devlink.c           | 182 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index d212e02f843f..79407bbd296d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -87,6 +87,7 @@ enum devlink_command {
 	DEVLINK_CMD_REGION_SET,
 	DEVLINK_CMD_REGION_NEW,
 	DEVLINK_CMD_REGION_DEL,
+	DEVLINK_CMD_REGION_READ,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
@@ -273,6 +274,12 @@ enum devlink_attr {
 	DEVLINK_ATTR_REGION_SNAPSHOT,           /* nested */
 	DEVLINK_ATTR_REGION_SNAPSHOT_ID,        /* u32 */
 
+	DEVLINK_ATTR_REGION_CHUNKS,             /* nested */
+	DEVLINK_ATTR_REGION_CHUNK,              /* nested */
+	DEVLINK_ATTR_REGION_CHUNK_DATA,         /* binary */
+	DEVLINK_ATTR_REGION_CHUNK_ADDR,         /* u64 */
+	DEVLINK_ATTR_REGION_CHUNK_LEN,          /* u64 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index fc0836371a71..e5118dba6bb4 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3388,6 +3388,181 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
 	return 0;
 }
 
+static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
+						 struct devlink *devlink,
+						 u8 *chunk, u32 chunk_size,
+						 u64 addr)
+{
+	struct nlattr *chunk_attr;
+	int err;
+
+	chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK);
+	if (!chunk_attr)
+		return -EINVAL;
+
+	err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
+	if (err)
+		goto nla_put_failure;
+
+	err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr,
+				DEVLINK_ATTR_PAD);
+	if (err)
+		goto nla_put_failure;
+
+	nla_nest_end(msg, chunk_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, chunk_attr);
+	return err;
+}
+
+#define DEVLINK_REGION_READ_CHUNK_SIZE 256
+
+static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
+						struct devlink *devlink,
+						struct devlink_region *region,
+						struct nlattr **attrs,
+						u64 start_offset,
+						u64 end_offset,
+						bool dump,
+						u64 *new_offset)
+{
+	struct devlink_snapshot *snapshot;
+	u64 curr_offset = start_offset;
+	u32 snapshot_id;
+	int err = 0;
+
+	*new_offset = start_offset;
+
+	snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+	snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+	if (!snapshot)
+		return -EINVAL;
+
+	if (end_offset > snapshot->data_len || dump)
+		end_offset = snapshot->data_len;
+
+	while (curr_offset < end_offset) {
+		u32 data_size;
+		u8 *data;
+
+		if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE)
+			data_size = end_offset - curr_offset;
+		else
+			data_size = DEVLINK_REGION_READ_CHUNK_SIZE;
+
+		data = &snapshot->data[curr_offset];
+		err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink,
+							    data, data_size,
+							    curr_offset);
+		if (err)
+			break;
+
+		curr_offset += data_size;
+	}
+	*new_offset = curr_offset;
+
+	return err;
+}
+
+static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
+					     struct netlink_callback *cb)
+{
+	u64 ret_offset, start_offset, end_offset = 0;
+	struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];
+	const struct genl_ops *ops = cb->data;
+	struct devlink_region *region;
+	struct nlattr *chunks_attr;
+	const char *region_name;
+	struct devlink *devlink;
+	bool dump = true;
+	void *hdr;
+	int err;
+
+	start_offset = *((u64 *)&cb->args[0]);
+
+	err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
+			  attrs, DEVLINK_ATTR_MAX, ops->policy, NULL);
+	if (err)
+		goto out;
+
+	devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+	if (IS_ERR(devlink))
+		goto out;
+
+	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
+
+	if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
+	    !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+		goto out_unlock;
+
+	region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
+	region = devlink_region_get_by_name(devlink, region_name);
+	if (!region)
+		goto out_unlock;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
+			  DEVLINK_CMD_REGION_READ);
+	if (!hdr)
+		goto out_unlock;
+
+	err = devlink_nl_put_handle(skb, devlink);
+	if (err)
+		goto nla_put_failure;
+
+	err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
+	if (err)
+		goto nla_put_failure;
+
+	chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS);
+	if (!chunks_attr)
+		goto nla_put_failure;
+
+	if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+	    attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+		if (!start_offset)
+			start_offset =
+				nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+		end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+		end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+		dump = false;
+	}
+
+	err = devlink_nl_region_read_snapshot_fill(skb, devlink,
+						   region, attrs,
+						   start_offset,
+						   end_offset, dump,
+						   &ret_offset);
+
+	if (err && err != -EMSGSIZE)
+		goto nla_put_failure;
+
+	/* Check if there was any progress done to prevent infinite loop */
+	if (ret_offset == start_offset)
+		goto nla_put_failure;
+
+	*((u64 *)&cb->args[0]) = ret_offset;
+
+	nla_nest_end(skb, chunks_attr);
+	genlmsg_end(skb, hdr);
+	mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+
+	return skb->len;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+out_unlock:
+	mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+out:
+	return 0;
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -3626,6 +3801,13 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_REGION_READ,
+		.dumpit = devlink_nl_cmd_region_read_dumpit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
cgit v1.2.3


From 4f91da26c81145f255cb153152ffed70014b1c41 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Jul 2018 20:36:38 -0700
Subject: xdp: add per mode attributes for attached programs

In preparation for simultaneous driver and hardware XDP
support, add per-mode attributes.  The catch-all IFLA_XDP_PROG_ID
will still be reported, but user space can now also access the
program ID in a new IFLA_XDP_<mode>_PROG_ID attribute.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_link.h |  3 +++
 net/core/rtnetlink.c         | 30 ++++++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index cf01b6824244..bc86c2b105ec 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -928,6 +928,9 @@ enum {
 	IFLA_XDP_ATTACHED,
 	IFLA_XDP_FLAGS,
 	IFLA_XDP_PROG_ID,
+	IFLA_XDP_DRV_PROG_ID,
+	IFLA_XDP_SKB_PROG_ID,
+	IFLA_XDP_HW_PROG_ID,
 	__IFLA_XDP_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ef61222fdef..b40242459907 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
 			  nla_total_size(1) +	/* XDP_ATTACHED */
-			  nla_total_size(4);	/* XDP_PROG_ID */
+			  nla_total_size(4) +	/* XDP_PROG_ID */
+			  nla_total_size(4);	/* XDP_<mode>_PROG_ID */
 
 	return xdp_size;
 }
@@ -1378,16 +1379,17 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
+	u32 prog_attr, prog_id;
 	struct nlattr *xdp;
-	u32 prog_id;
 	int err;
+	u8 mode;
 
 	xdp = nla_nest_start(skb, IFLA_XDP);
 	if (!xdp)
 		return -EMSGSIZE;
 
-	err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
-			 rtnl_xdp_attached_mode(dev, &prog_id));
+	mode = rtnl_xdp_attached_mode(dev, &prog_id);
+	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
 	if (err)
 		goto err_cancel;
 
@@ -1395,6 +1397,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 		err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
 		if (err)
 			goto err_cancel;
+
+		switch (mode) {
+		case XDP_ATTACHED_DRV:
+			prog_attr = IFLA_XDP_DRV_PROG_ID;
+			break;
+		case XDP_ATTACHED_SKB:
+			prog_attr = IFLA_XDP_SKB_PROG_ID;
+			break;
+		case XDP_ATTACHED_HW:
+			prog_attr = IFLA_XDP_HW_PROG_ID;
+			break;
+		case XDP_ATTACHED_NONE:
+		default:
+			err = -EINVAL;
+			goto err_cancel;
+		}
+
+		err = nla_put_u32(skb, prog_attr, prog_id);
+		if (err)
+			goto err_cancel;
 	}
 
 	nla_nest_end(skb, xdp);
-- 
cgit v1.2.3


From a25717d2b604347d9af8da81deea7b08e8c94220 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Jul 2018 20:36:41 -0700
Subject: xdp: support simultaneous driver and hw XDP attachment

Split the query of HW-attached program from the software one.
Introduce new .ndo_bpf command to query HW-attached program.
This will allow drivers to install different programs in HW
and SW at the same time.  Netlink can now also carry multiple
programs on dump (in which case mode will be set to
XDP_ATTACHED_MULTI and user has to check per-attachment point
attributes, IFLA_XDP_PROG_ID will not be present).  We reuse
IFLA_XDP_PROG_ID skb space for second mode, so rtnl_xdp_size()
doesn't need to be updated.

Note that the installation side is still not there, since all
drivers currently reject installing more than one program at
a time.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  6 ++
 drivers/net/netdevsim/bpf.c                        |  6 ++
 include/linux/netdevice.h                          |  7 +-
 include/uapi/linux/if_link.h                       |  1 +
 net/core/dev.c                                     | 45 ++++++-----
 net/core/rtnetlink.c                               | 93 ++++++++++++----------
 6 files changed, 96 insertions(+), 62 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4bb589dbffbc..bb1e72e8dbc2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3453,6 +3453,12 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG_HW:
 		return nfp_net_xdp_setup(nn, xdp);
 	case XDP_QUERY_PROG:
+		if (nn->dp.bpf_offload_xdp)
+			return 0;
+		return xdp_attachment_query(&nn->xdp, xdp);
+	case XDP_QUERY_PROG_HW:
+		if (!nn->dp.bpf_offload_xdp)
+			return 0;
 		return xdp_attachment_query(&nn->xdp, xdp);
 	default:
 		return nfp_app_bpf(nn->app, nn, xdp);
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index c485d97b5df4..5544c9b51173 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -561,6 +561,12 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		nsim_bpf_destroy_prog(bpf->offload.prog);
 		return 0;
 	case XDP_QUERY_PROG:
+		if (ns->xdp_prog_mode != XDP_ATTACHED_DRV)
+			return 0;
+		return xdp_attachment_query(&ns->xdp, bpf);
+	case XDP_QUERY_PROG_HW:
+		if (ns->xdp_prog_mode != XDP_ATTACHED_HW)
+			return 0;
 		return xdp_attachment_query(&ns->xdp, bpf);
 	case XDP_SETUP_PROG:
 		err = nsim_setup_prog_checks(ns, bpf);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 69a664789b33..2422c0e88f5c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -820,6 +820,7 @@ enum bpf_netdev_command {
 	XDP_SETUP_PROG,
 	XDP_SETUP_PROG_HW,
 	XDP_QUERY_PROG,
+	XDP_QUERY_PROG_HW,
 	/* BPF program for offload callbacks, invoked at program load time. */
 	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
@@ -843,7 +844,7 @@ struct netdev_bpf {
 			struct bpf_prog *prog;
 			struct netlink_ext_ack *extack;
 		};
-		/* XDP_QUERY_PROG */
+		/* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */
 		struct {
 			u32 prog_id;
 			/* flags with which program was installed */
@@ -3533,8 +3534,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags);
-void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
-		     struct netdev_bpf *xdp);
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
+		    enum bpf_netdev_command cmd);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bc86c2b105ec..8759cfb8aa2e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -920,6 +920,7 @@ enum {
 	XDP_ATTACHED_DRV,
 	XDP_ATTACHED_SKB,
 	XDP_ATTACHED_HW,
+	XDP_ATTACHED_MULTI,
 };
 
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index 9fa3b3705a8e..993cdc3cd086 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7582,21 +7582,19 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
-		     struct netdev_bpf *xdp)
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+		    enum bpf_netdev_command cmd)
 {
-	memset(xdp, 0, sizeof(*xdp));
-	xdp->command = XDP_QUERY_PROG;
+	struct netdev_bpf xdp;
 
-	/* Query must always succeed. */
-	WARN_ON(bpf_op(dev, xdp) < 0);
-}
+	if (!bpf_op)
+		return 0;
 
-static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
-{
-	struct netdev_bpf xdp;
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = cmd;
 
-	__dev_xdp_query(dev, bpf_op, &xdp);
+	/* Query must always succeed. */
+	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
 
 	return xdp.prog_id;
 }
@@ -7632,12 +7630,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
 	if (!ndo_bpf)
 		return;
 
-	__dev_xdp_query(dev, ndo_bpf, &xdp);
-	if (!xdp.prog_id)
-		return;
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG;
+	WARN_ON(ndo_bpf(dev, &xdp));
+	if (xdp.prog_id)
+		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+					NULL));
 
-	/* Program removal should always succeed */
-	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+	/* Remove HW offload */
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG_HW;
+	if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+					NULL));
 }
 
 /**
@@ -7653,12 +7658,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
+	enum bpf_netdev_command query;
 	struct bpf_prog *prog = NULL;
 	bpf_op_t bpf_op, bpf_chk;
 	int err;
 
 	ASSERT_RTNL();
 
+	query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
 	bpf_op = bpf_chk = ops->ndo_bpf;
 	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
 		return -EOPNOTSUPP;
@@ -7668,10 +7676,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		bpf_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
+		if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+		    __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
 			return -EEXIST;
 		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-		    __dev_xdp_attached(dev, bpf_op))
+		    __dev_xdp_query(dev, bpf_op, query))
 			return -EBUSY;
 
 		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 02ebc056a688..c9929ef17539 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,7 @@ static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
 			  nla_total_size(1) +	/* XDP_ATTACHED */
-			  nla_total_size(4) +	/* XDP_PROG_ID */
+			  nla_total_size(4) +	/* XDP_PROG_ID (or 1st mode) */
 			  nla_total_size(4);	/* XDP_<mode>_PROG_ID */
 
 	return xdp_size;
@@ -1354,37 +1354,57 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
-static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
 {
-	const struct net_device_ops *ops = dev->netdev_ops;
 	const struct bpf_prog *generic_xdp_prog;
-	struct netdev_bpf xdp;
 
 	ASSERT_RTNL();
 
-	*prog_id = 0;
 	generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
-	if (generic_xdp_prog) {
-		*prog_id = generic_xdp_prog->aux->id;
-		return XDP_ATTACHED_SKB;
-	}
-	if (!ops->ndo_bpf)
-		return XDP_ATTACHED_NONE;
+	if (!generic_xdp_prog)
+		return 0;
+	return generic_xdp_prog->aux->id;
+}
+
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+	return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
+
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+	return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+			       XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+			       u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+			       u32 (*get_prog_id)(struct net_device *dev))
+{
+	u32 curr_id;
+	int err;
+
+	curr_id = get_prog_id(dev);
+	if (!curr_id)
+		return 0;
+
+	*prog_id = curr_id;
+	err = nla_put_u32(skb, attr, curr_id);
+	if (err)
+		return err;
 
-	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
-	if (!xdp.prog_id)
-		return XDP_ATTACHED_NONE;
+	if (*mode != XDP_ATTACHED_NONE)
+		*mode = XDP_ATTACHED_MULTI;
+	else
+		*mode = tgt_mode;
 
-	*prog_id = xdp.prog_id;
-	if (xdp.prog_flags & XDP_FLAGS_HW_MODE)
-		return XDP_ATTACHED_HW;
-	return XDP_ATTACHED_DRV;
+	return 0;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
-	u32 prog_attr, prog_id;
 	struct nlattr *xdp;
+	u32 prog_id;
 	int err;
 	u8 mode;
 
@@ -1392,35 +1412,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 	if (!xdp)
 		return -EMSGSIZE;
 
-	mode = rtnl_xdp_attached_mode(dev, &prog_id);
+	prog_id = 0;
+	mode = XDP_ATTACHED_NONE;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+				IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb))
+		goto err_cancel;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+				IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv))
+		goto err_cancel;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+				IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw))
+		goto err_cancel;
+
 	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
 	if (err)
 		goto err_cancel;
 
-	if (prog_id) {
+	if (prog_id && mode != XDP_ATTACHED_MULTI) {
 		err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
 		if (err)
 			goto err_cancel;
-
-		switch (mode) {
-		case XDP_ATTACHED_DRV:
-			prog_attr = IFLA_XDP_DRV_PROG_ID;
-			break;
-		case XDP_ATTACHED_SKB:
-			prog_attr = IFLA_XDP_SKB_PROG_ID;
-			break;
-		case XDP_ATTACHED_HW:
-			prog_attr = IFLA_XDP_HW_PROG_ID;
-			break;
-		case XDP_ATTACHED_NONE:
-		default:
-			err = -EINVAL;
-			goto err_cancel;
-		}
-
-		err = nla_put_u32(skb, prog_attr, prog_id);
-		if (err)
-			goto err_cancel;
 	}
 
 	nla_nest_end(skb, xdp);
-- 
cgit v1.2.3


From c921c2077b32081617789a645120148bc8b60c98 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 13 Jul 2018 12:16:43 +0300
Subject: net: ipmr: add support for passing full packet on wrong vif

This patch adds support for IGMPMSG_WRVIFWHOLE which is used to pass
full packet and real vif id when the incoming interface is wrong.
While the RP and FHR are setting up state we need to be sending the
registers encapsulated with all the data inside otherwise we lose it.
The RP then decapsulates it and forwards it to the interested parties.
Currently with WRONGVIF we can only be sending empty register packets
and will lose that data.
This behaviour can be enabled by using MRT_PIM with
val == IGMPMSG_WRVIFWHOLE. This doesn't prevent IGMPMSG_WRONGVIF from
happening, it happens in addition to it, also it is controlled by the same
throttling parameters as WRONGVIF (i.e. 1 packet per 3 seconds currently).
Both messages are generated to keep backwards compatibility and avoid
breaking someone who was enabling MRT_PIM with val == 4, since any
positive val is accepted and treated the same.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h |  1 +
 include/uapi/linux/mroute.h |  2 ++
 net/ipv4/ipmr.c             | 21 ++++++++++++++++-----
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index fd436cdd4725..6675b9f81979 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -254,6 +254,7 @@ struct mr_table {
 	atomic_t		cache_resolve_queue_len;
 	bool			mroute_do_assert;
 	bool			mroute_do_pim;
+	bool			mroute_do_wrvifwhole;
 	int			mroute_reg_vif_num;
 };
 
diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
index 10f9ff9426a2..5d37a9ccce63 100644
--- a/include/uapi/linux/mroute.h
+++ b/include/uapi/linux/mroute.h
@@ -120,6 +120,7 @@ enum {
 	IPMRA_TABLE_MROUTE_DO_ASSERT,
 	IPMRA_TABLE_MROUTE_DO_PIM,
 	IPMRA_TABLE_VIFS,
+	IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
 	__IPMRA_TABLE_MAX
 };
 #define IPMRA_TABLE_MAX (__IPMRA_TABLE_MAX - 1)
@@ -173,5 +174,6 @@ enum {
 #define IGMPMSG_NOCACHE		1		/* Kern cache fill request to mrouted */
 #define IGMPMSG_WRONGVIF	2		/* For PIM assert processing (unused) */
 #define IGMPMSG_WHOLEPKT	3		/* For PIM Register processing */
+#define IGMPMSG_WRVIFWHOLE	4		/* For PIM Register and assert processing */
 
 #endif /* _UAPI__LINUX_MROUTE_H */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 82f914122f1b..5660adcf7a04 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	struct sk_buff *skb;
 	int ret;
 
-	if (assert == IGMPMSG_WHOLEPKT)
+	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
 	else
 		skb = alloc_skb(128, GFP_ATOMIC);
@@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	if (!skb)
 		return -ENOBUFS;
 
-	if (assert == IGMPMSG_WHOLEPKT) {
+	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
 		/* Ugly, but we have no choice with this interface.
 		 * Duplicate old header, fix ihl, length etc.
 		 * And all this only to mangle msg->im_msgtype and
@@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
 		skb_reset_transport_header(skb);
 		msg = (struct igmpmsg *)skb_network_header(skb);
 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
-		msg->im_msgtype = IGMPMSG_WHOLEPKT;
+		msg->im_msgtype = assert;
 		msg->im_mbz = 0;
-		msg->im_vif = mrt->mroute_reg_vif_num;
+		if (assert == IGMPMSG_WRVIFWHOLE)
+			msg->im_vif = vifi;
+		else
+			msg->im_vif = mrt->mroute_reg_vif_num;
 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
 					     sizeof(struct iphdr));
@@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 	struct mr_table *mrt;
 	struct vifctl vif;
 	struct mfcctl mfc;
+	bool do_wrvifwhole;
 	u32 uval;
 
 	/* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 			break;
 		}
 
+		do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
 		val = !!val;
 		if (val != mrt->mroute_do_pim) {
 			mrt->mroute_do_pim = val;
 			mrt->mroute_do_assert = val;
+			mrt->mroute_do_wrvifwhole = do_wrvifwhole;
 		}
 		break;
 	case MRT_TABLE:
@@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			       MFC_ASSERT_THRESH)) {
 			c->_c.mfc_un.res.last_assert = jiffies;
 			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+			if (mrt->mroute_do_wrvifwhole)
+				ipmr_cache_report(mrt, skb, true_vifi,
+						  IGMPMSG_WRVIFWHOLE);
 		}
 		goto dont_forward;
 	}
@@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
 			mrt->mroute_reg_vif_num) ||
 	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
 		       mrt->mroute_do_assert) ||
-	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+		       mrt->mroute_do_wrvifwhole))
 		return false;
 
 	return true;
-- 
cgit v1.2.3


From c3086637b0d7dbee0925697f8dbee2bcf9637b9f Mon Sep 17 00:00:00 2001
From: Michael Heimpold <mhei@heimpold.de>
Date: Wed, 11 Jul 2018 23:10:55 +0200
Subject: net: ethtool: fix spelling mistake: "tubale" -> "tunable"

Signed-off-by: Michael Heimpold <mhei@heimpold.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 4ca65b56084f..7363f18e65a5 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -226,7 +226,7 @@ enum tunable_id {
 	ETHTOOL_TX_COPYBREAK,
 	ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */
 	/*
-	 * Add your fresh new tubale attribute above and remember to update
+	 * Add your fresh new tunable attribute above and remember to update
 	 * tunable_strings[] in net/core/ethtool.c
 	 */
 	__ETHTOOL_TUNABLE_COUNT,
-- 
cgit v1.2.3


From f333ee0cdb27ba201e6cc0c99c76b1364aa29b86 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 11 Jul 2018 17:33:32 -0700
Subject: bpf: Add BPF_SOCK_OPS_TCP_LISTEN_CB

Add new TCP-BPF callback that is called on listen(2) right after socket
transition to TCP_LISTEN state.

It fills the gap for listening sockets in TCP-BPF. For example BPF
program can set BPF_SOCK_OPS_STATE_CB_FLAG when socket becomes listening
and track later transition from TCP_LISTEN to TCP_CLOSE with
BPF_SOCK_OPS_STATE_CB callback.

Before there was no way to do it with TCP-BPF and other options were
much harder to work with. E.g. socket state tracking can be done with
tracepoints (either raw or regular) but they can't be attached to cgroup
and their lifetime has to be managed separately.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 3 +++
 net/ipv4/af_inet.c       | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6bcb287a888d..870113916cac 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2555,6 +2555,9 @@ enum {
 					 * Arg1: old_state
 					 * Arg2: new_state
 					 */
+	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
+					 * socket transition to LISTEN state.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c716be13d58c..f2a0a3bab6b5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog)
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
+		tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
 	}
 	sk->sk_max_ack_backlog = backlog;
 	err = 0;
-- 
cgit v1.2.3


From a7fe68a1e8e4bce007505f729bc33e427c540386 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Sat, 14 Jul 2018 19:05:58 -0400
Subject: drm/amd: Add CU-masking ioctl definition to kfd_ioctl.h

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 include/uapi/linux/kfd_ioctl.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 88d17c39dbf9..01674b56e14f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -76,6 +76,12 @@ struct kfd_ioctl_update_queue_args {
 	__u32 queue_priority;	/* to KFD */
 };
 
+struct kfd_ioctl_set_cu_mask_args {
+	__u32 queue_id;		/* to KFD */
+	__u32 num_cu_mask;		/* to KFD */
+	__u64 cu_mask_ptr;		/* to KFD */
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -466,7 +472,10 @@ struct kfd_ioctl_unmap_memory_from_gpu_args {
 #define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU	\
 		AMDKFD_IOWR(0x19, struct kfd_ioctl_unmap_memory_from_gpu_args)
 
+#define AMDKFD_IOC_SET_CU_MASK		\
+		AMDKFD_IOW(0x1A, struct kfd_ioctl_set_cu_mask_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1A
+#define AMDKFD_COMMAND_END		0x1B
 
 #endif
-- 
cgit v1.2.3


From 620e1902f6fe57ddacdabd9e33fadbd290be9652 Mon Sep 17 00:00:00 2001
From: Wu Hao <hao.wu@intel.com>
Date: Sat, 30 Jun 2018 08:53:23 +0800
Subject: fpga: dfl: fme: add DFL_FPGA_GET_API_VERSION/CHECK_EXTENSION ioctls
 support

DFL_FPGA_GET_API_VERSION and DFL_FPGA_CHECK_EXTENSION ioctls are common
ones which need to be supported by all feature devices drivers including
FME and AFU. Userspace application can use these ioctl interfaces to get
the API info and check if specific extension is supported or not in
current driver.

This patch implements above 2 ioctls in FPGA Management Engine (FME)
driver.

Signed-off-by: Tim Whisonant <tim.whisonant@intel.com>
Signed-off-by: Enno Luebbers <enno.luebbers@intel.com>
Signed-off-by: Shiva Rao <shiva.rao@intel.com>
Signed-off-by: Christopher Rauer <christopher.rauer@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Acked-by: Moritz Fischer <mdf@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ioctl/ioctl-number.txt |  1 +
 drivers/fpga/dfl-fme-main.c          | 12 +++++++++
 include/uapi/linux/fpga-dfl.h        | 50 ++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)
 create mode 100644 include/uapi/linux/fpga-dfl.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 480c8609dc58..db9afea24299 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -322,6 +322,7 @@ Code  Seq#(hex)	Include File		Comments
 0xB3	00	linux/mmc/ioctl.h
 0xB4	00-0F	linux/gpio.h		<mailto:linux-gpio@vger.kernel.org>
 0xB5	00-0F	uapi/linux/rpmsg.h	<mailto:linux-remoteproc@vger.kernel.org>
+0xB6	all	linux/fpga-dfl.h
 0xC0	00-0F	linux/usb/iowarrior.h
 0xCA	00-0F	uapi/misc/cxl.h
 0xCA	10-2F	uapi/misc/ocxl.h
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index c23c56fe3f4b..c83ff88e3bbb 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -16,6 +16,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/fpga-dfl.h>
 
 #include "dfl.h"
 
@@ -116,6 +117,13 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
 	},
 };
 
+static long fme_ioctl_check_extension(struct dfl_feature_platform_data *pdata,
+				      unsigned long arg)
+{
+	/* No extension support for now */
+	return 0;
+}
+
 static int fme_open(struct inode *inode, struct file *filp)
 {
 	struct platform_device *fdev = dfl_fpga_inode_to_feature_dev(inode);
@@ -156,6 +164,10 @@ static long fme_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	dev_dbg(&pdev->dev, "%s cmd 0x%x\n", __func__, cmd);
 
 	switch (cmd) {
+	case DFL_FPGA_GET_API_VERSION:
+		return DFL_FPGA_API_VERSION;
+	case DFL_FPGA_CHECK_EXTENSION:
+		return fme_ioctl_check_extension(pdata, arg);
 	default:
 		/*
 		 * Let sub-feature's ioctl function to handle the cmd.
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
new file mode 100644
index 000000000000..858e4437c31c
--- /dev/null
+++ b/include/uapi/linux/fpga-dfl.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Header File for FPGA DFL User API
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Zhang Yi <yi.z.zhang@intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+
+#ifndef _UAPI_LINUX_FPGA_DFL_H
+#define _UAPI_LINUX_FPGA_DFL_H
+
+#include <linux/ioctl.h>
+
+#define DFL_FPGA_API_VERSION 0
+
+/*
+ * The IOCTL interface for DFL based FPGA is designed for extensibility by
+ * embedding the structure length (argsz) and flags into structures passed
+ * between kernel and userspace. This design referenced the VFIO IOCTL
+ * interface (include/uapi/linux/vfio.h).
+ */
+
+#define DFL_FPGA_MAGIC 0xB6
+
+#define DFL_FPGA_BASE 0
+
+/**
+ * DFL_FPGA_GET_API_VERSION - _IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 0)
+ *
+ * Report the version of the driver API.
+ * Return: Driver API Version.
+ */
+
+#define DFL_FPGA_GET_API_VERSION	_IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 0)
+
+/**
+ * DFL_FPGA_CHECK_EXTENSION - _IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 1)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, otherwise the extension is supported.
+ */
+
+#define DFL_FPGA_CHECK_EXTENSION	_IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 1)
+
+#endif /* _UAPI_LINUX_FPGA_DFL_H */
-- 
cgit v1.2.3


From 29de76240e861d52b75405166337e94184f1875d Mon Sep 17 00:00:00 2001
From: Kang Luwei <luwei.kang@intel.com>
Date: Sat, 30 Jun 2018 08:53:24 +0800
Subject: fpga: dfl: fme: add partial reconfiguration sub feature support

Partial Reconfiguration (PR) is the most important function for FME. It
allows reconfiguration for given Port/Accelerated Function Unit (AFU).

It creates platform devices for fpga-mgr, fpga-regions and fpga-bridges,
and invokes fpga-region's interface (fpga_region_program_fpga) for PR
operation once PR request received via ioctl. Below user space interface
is exposed by this sub feature.

Ioctl interface:
* DFL_FPGA_FME_PORT_PR
  Do partial reconfiguration per information from userspace, including
  target port(AFU), buffer size and address info. It returns error code
  to userspace if failed. For detailed PR error information, user needs
  to read fpga-mgr's status sysfs interface.

Signed-off-by: Tim Whisonant <tim.whisonant@intel.com>
Signed-off-by: Enno Luebbers <enno.luebbers@intel.com>
Signed-off-by: Shiva Rao <shiva.rao@intel.com>
Signed-off-by: Christopher Rauer <christopher.rauer@intel.com>
Signed-off-by: Kang Luwei <luwei.kang@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/Makefile         |   2 +-
 drivers/fpga/dfl-fme-main.c   |  43 +++-
 drivers/fpga/dfl-fme-pr.c     | 479 ++++++++++++++++++++++++++++++++++++++++++
 drivers/fpga/dfl-fme-pr.h     |  84 ++++++++
 drivers/fpga/dfl-fme.h        |  38 ++++
 include/uapi/linux/fpga-dfl.h |  27 +++
 6 files changed, 671 insertions(+), 2 deletions(-)
 create mode 100644 drivers/fpga/dfl-fme-pr.c
 create mode 100644 drivers/fpga/dfl-fme-pr.h
 create mode 100644 drivers/fpga/dfl-fme.h

(limited to 'include/uapi/linux')

diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index db11f340ba0f..fd334d40aa1c 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_OF_FPGA_REGION)		+= of-fpga-region.o
 obj-$(CONFIG_FPGA_DFL)			+= dfl.o
 obj-$(CONFIG_FPGA_DFL_FME)		+= dfl-fme.o
 
-dfl-fme-objs := dfl-fme-main.o
+dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
 
 # Drivers for FPGAs which implement DFL
 obj-$(CONFIG_FPGA_DFL_PCI)		+= dfl-pci.o
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index c83ff88e3bbb..086ad2420ade 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -19,6 +19,7 @@
 #include <linux/fpga-dfl.h>
 
 #include "dfl.h"
+#include "dfl-fme.h"
 
 static ssize_t ports_num_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
@@ -112,6 +113,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
 		.id = FME_FEATURE_ID_HEADER,
 		.ops = &fme_hdr_ops,
 	},
+	{
+		.id = FME_FEATURE_ID_PR_MGMT,
+		.ops = &pr_mgmt_ops,
+	},
 	{
 		.ops = NULL,
 	},
@@ -187,6 +192,35 @@ static long fme_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
+static int fme_dev_init(struct platform_device *pdev)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_fme *fme;
+
+	fme = devm_kzalloc(&pdev->dev, sizeof(*fme), GFP_KERNEL);
+	if (!fme)
+		return -ENOMEM;
+
+	fme->pdata = pdata;
+
+	mutex_lock(&pdata->lock);
+	dfl_fpga_pdata_set_private(pdata, fme);
+	mutex_unlock(&pdata->lock);
+
+	return 0;
+}
+
+static void fme_dev_destroy(struct platform_device *pdev)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_fme *fme;
+
+	mutex_lock(&pdata->lock);
+	fme = dfl_fpga_pdata_get_private(pdata);
+	dfl_fpga_pdata_set_private(pdata, NULL);
+	mutex_unlock(&pdata->lock);
+}
+
 static const struct file_operations fme_fops = {
 	.owner		= THIS_MODULE,
 	.open		= fme_open,
@@ -198,10 +232,14 @@ static int fme_probe(struct platform_device *pdev)
 {
 	int ret;
 
-	ret = dfl_fpga_dev_feature_init(pdev, fme_feature_drvs);
+	ret = fme_dev_init(pdev);
 	if (ret)
 		goto exit;
 
+	ret = dfl_fpga_dev_feature_init(pdev, fme_feature_drvs);
+	if (ret)
+		goto dev_destroy;
+
 	ret = dfl_fpga_dev_ops_register(pdev, &fme_fops, THIS_MODULE);
 	if (ret)
 		goto feature_uinit;
@@ -210,6 +248,8 @@ static int fme_probe(struct platform_device *pdev)
 
 feature_uinit:
 	dfl_fpga_dev_feature_uinit(pdev);
+dev_destroy:
+	fme_dev_destroy(pdev);
 exit:
 	return ret;
 }
@@ -218,6 +258,7 @@ static int fme_remove(struct platform_device *pdev)
 {
 	dfl_fpga_dev_ops_unregister(pdev);
 	dfl_fpga_dev_feature_uinit(pdev);
+	fme_dev_destroy(pdev);
 
 	return 0;
 }
diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c
new file mode 100644
index 000000000000..fc9fd2d0482f
--- /dev/null
+++ b/drivers/fpga/dfl-fme-pr.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Management Engine (FME) Partial Reconfiguration
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Joseph Grecco <joe.grecco@intel.com>
+ *   Enno Luebbers <enno.luebbers@intel.com>
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ *   Christopher Rauer <christopher.rauer@intel.com>
+ *   Henry Mitchel <henry.mitchel@intel.com>
+ */
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/fpga/fpga-mgr.h>
+#include <linux/fpga/fpga-bridge.h>
+#include <linux/fpga/fpga-region.h>
+#include <linux/fpga-dfl.h>
+
+#include "dfl.h"
+#include "dfl-fme.h"
+#include "dfl-fme-pr.h"
+
+static struct dfl_fme_region *
+dfl_fme_region_find_by_port_id(struct dfl_fme *fme, int port_id)
+{
+	struct dfl_fme_region *fme_region;
+
+	list_for_each_entry(fme_region, &fme->region_list, node)
+		if (fme_region->port_id == port_id)
+			return fme_region;
+
+	return NULL;
+}
+
+static int dfl_fme_region_match(struct device *dev, const void *data)
+{
+	return dev->parent == data;
+}
+
+static struct fpga_region *dfl_fme_region_find(struct dfl_fme *fme, int port_id)
+{
+	struct dfl_fme_region *fme_region;
+	struct fpga_region *region;
+
+	fme_region = dfl_fme_region_find_by_port_id(fme, port_id);
+	if (!fme_region)
+		return NULL;
+
+	region = fpga_region_class_find(NULL, &fme_region->region->dev,
+					dfl_fme_region_match);
+	if (!region)
+		return NULL;
+
+	return region;
+}
+
+static int fme_pr(struct platform_device *pdev, unsigned long arg)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	void __user *argp = (void __user *)arg;
+	struct dfl_fpga_fme_port_pr port_pr;
+	struct fpga_image_info *info;
+	struct fpga_region *region;
+	void __iomem *fme_hdr;
+	struct dfl_fme *fme;
+	unsigned long minsz;
+	void *buf = NULL;
+	int ret = 0;
+	u64 v;
+
+	minsz = offsetofend(struct dfl_fpga_fme_port_pr, buffer_address);
+
+	if (copy_from_user(&port_pr, argp, minsz))
+		return -EFAULT;
+
+	if (port_pr.argsz < minsz || port_pr.flags)
+		return -EINVAL;
+
+	if (!IS_ALIGNED(port_pr.buffer_size, 4))
+		return -EINVAL;
+
+	/* get fme header region */
+	fme_hdr = dfl_get_feature_ioaddr_by_id(&pdev->dev,
+					       FME_FEATURE_ID_HEADER);
+
+	/* check port id */
+	v = readq(fme_hdr + FME_HDR_CAP);
+	if (port_pr.port_id >= FIELD_GET(FME_CAP_NUM_PORTS, v)) {
+		dev_dbg(&pdev->dev, "port number more than maximum\n");
+		return -EINVAL;
+	}
+
+	if (!access_ok(VERIFY_READ,
+		       (void __user *)(unsigned long)port_pr.buffer_address,
+		       port_pr.buffer_size))
+		return -EFAULT;
+
+	buf = vmalloc(port_pr.buffer_size);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf,
+			   (void __user *)(unsigned long)port_pr.buffer_address,
+			   port_pr.buffer_size)) {
+		ret = -EFAULT;
+		goto free_exit;
+	}
+
+	/* prepare fpga_image_info for PR */
+	info = fpga_image_info_alloc(&pdev->dev);
+	if (!info) {
+		ret = -ENOMEM;
+		goto free_exit;
+	}
+
+	info->flags |= FPGA_MGR_PARTIAL_RECONFIG;
+
+	mutex_lock(&pdata->lock);
+	fme = dfl_fpga_pdata_get_private(pdata);
+	/* fme device has been unregistered. */
+	if (!fme) {
+		ret = -EINVAL;
+		goto unlock_exit;
+	}
+
+	region = dfl_fme_region_find(fme, port_pr.port_id);
+	if (!region) {
+		ret = -EINVAL;
+		goto unlock_exit;
+	}
+
+	fpga_image_info_free(region->info);
+
+	info->buf = buf;
+	info->count = port_pr.buffer_size;
+	info->region_id = port_pr.port_id;
+	region->info = info;
+
+	ret = fpga_region_program_fpga(region);
+
+	/*
+	 * it allows userspace to reset the PR region's logic by disabling and
+	 * reenabling the bridge to clear things out between accleration runs.
+	 * so no need to hold the bridges after partial reconfiguration.
+	 */
+	if (region->get_bridges)
+		fpga_bridges_put(&region->bridge_list);
+
+	put_device(&region->dev);
+unlock_exit:
+	mutex_unlock(&pdata->lock);
+free_exit:
+	vfree(buf);
+	if (copy_to_user((void __user *)arg, &port_pr, minsz))
+		return -EFAULT;
+
+	return ret;
+}
+
+/**
+ * dfl_fme_create_mgr - create fpga mgr platform device as child device
+ *
+ * @pdata: fme platform_device's pdata
+ *
+ * Return: mgr platform device if successful, and error code otherwise.
+ */
+static struct platform_device *
+dfl_fme_create_mgr(struct dfl_feature_platform_data *pdata,
+		   struct dfl_feature *feature)
+{
+	struct platform_device *mgr, *fme = pdata->dev;
+	struct dfl_fme_mgr_pdata mgr_pdata;
+	int ret = -ENOMEM;
+
+	if (!feature->ioaddr)
+		return ERR_PTR(-ENODEV);
+
+	mgr_pdata.ioaddr = feature->ioaddr;
+
+	/*
+	 * Each FME has only one fpga-mgr, so allocate platform device using
+	 * the same FME platform device id.
+	 */
+	mgr = platform_device_alloc(DFL_FPGA_FME_MGR, fme->id);
+	if (!mgr)
+		return ERR_PTR(ret);
+
+	mgr->dev.parent = &fme->dev;
+
+	ret = platform_device_add_data(mgr, &mgr_pdata, sizeof(mgr_pdata));
+	if (ret)
+		goto create_mgr_err;
+
+	ret = platform_device_add(mgr);
+	if (ret)
+		goto create_mgr_err;
+
+	return mgr;
+
+create_mgr_err:
+	platform_device_put(mgr);
+	return ERR_PTR(ret);
+}
+
+/**
+ * dfl_fme_destroy_mgr - destroy fpga mgr platform device
+ * @pdata: fme platform device's pdata
+ */
+static void dfl_fme_destroy_mgr(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_fme *priv = dfl_fpga_pdata_get_private(pdata);
+
+	platform_device_unregister(priv->mgr);
+}
+
+/**
+ * dfl_fme_create_bridge - create fme fpga bridge platform device as child
+ *
+ * @pdata: fme platform device's pdata
+ * @port_id: port id for the bridge to be created.
+ *
+ * Return: fme bridge if successful, and error code otherwise.
+ */
+static struct dfl_fme_bridge *
+dfl_fme_create_bridge(struct dfl_feature_platform_data *pdata, int port_id)
+{
+	struct device *dev = &pdata->dev->dev;
+	struct dfl_fme_br_pdata br_pdata;
+	struct dfl_fme_bridge *fme_br;
+	int ret = -ENOMEM;
+
+	fme_br = devm_kzalloc(dev, sizeof(*fme_br), GFP_KERNEL);
+	if (!fme_br)
+		return ERR_PTR(ret);
+
+	br_pdata.cdev = pdata->dfl_cdev;
+	br_pdata.port_id = port_id;
+
+	fme_br->br = platform_device_alloc(DFL_FPGA_FME_BRIDGE,
+					   PLATFORM_DEVID_AUTO);
+	if (!fme_br->br)
+		return ERR_PTR(ret);
+
+	fme_br->br->dev.parent = dev;
+
+	ret = platform_device_add_data(fme_br->br, &br_pdata, sizeof(br_pdata));
+	if (ret)
+		goto create_br_err;
+
+	ret = platform_device_add(fme_br->br);
+	if (ret)
+		goto create_br_err;
+
+	return fme_br;
+
+create_br_err:
+	platform_device_put(fme_br->br);
+	return ERR_PTR(ret);
+}
+
+/**
+ * dfl_fme_destroy_bridge - destroy fpga bridge platform device
+ * @fme_br: fme bridge to destroy
+ */
+static void dfl_fme_destroy_bridge(struct dfl_fme_bridge *fme_br)
+{
+	platform_device_unregister(fme_br->br);
+}
+
+/**
+ * dfl_fme_destroy_bridges - destroy all fpga bridge platform devices
+ * @pdata: fme platform device's pdata
+ */
+static void dfl_fme_destroy_bridges(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_fme *priv = dfl_fpga_pdata_get_private(pdata);
+	struct dfl_fme_bridge *fbridge, *tmp;
+
+	list_for_each_entry_safe(fbridge, tmp, &priv->bridge_list, node) {
+		list_del(&fbridge->node);
+		dfl_fme_destroy_bridge(fbridge);
+	}
+}
+
+/**
+ * dfl_fme_create_region - create fpga region platform device as child
+ *
+ * @pdata: fme platform device's pdata
+ * @mgr: mgr platform device needed for region
+ * @br: br platform device needed for region
+ * @port_id: port id
+ *
+ * Return: fme region if successful, and error code otherwise.
+ */
+static struct dfl_fme_region *
+dfl_fme_create_region(struct dfl_feature_platform_data *pdata,
+		      struct platform_device *mgr,
+		      struct platform_device *br, int port_id)
+{
+	struct dfl_fme_region_pdata region_pdata;
+	struct device *dev = &pdata->dev->dev;
+	struct dfl_fme_region *fme_region;
+	int ret = -ENOMEM;
+
+	fme_region = devm_kzalloc(dev, sizeof(*fme_region), GFP_KERNEL);
+	if (!fme_region)
+		return ERR_PTR(ret);
+
+	region_pdata.mgr = mgr;
+	region_pdata.br = br;
+
+	/*
+	 * Each FPGA device may have more than one port, so allocate platform
+	 * device using the same port platform device id.
+	 */
+	fme_region->region = platform_device_alloc(DFL_FPGA_FME_REGION, br->id);
+	if (!fme_region->region)
+		return ERR_PTR(ret);
+
+	fme_region->region->dev.parent = dev;
+
+	ret = platform_device_add_data(fme_region->region, &region_pdata,
+				       sizeof(region_pdata));
+	if (ret)
+		goto create_region_err;
+
+	ret = platform_device_add(fme_region->region);
+	if (ret)
+		goto create_region_err;
+
+	fme_region->port_id = port_id;
+
+	return fme_region;
+
+create_region_err:
+	platform_device_put(fme_region->region);
+	return ERR_PTR(ret);
+}
+
+/**
+ * dfl_fme_destroy_region - destroy fme region
+ * @fme_region: fme region to destroy
+ */
+static void dfl_fme_destroy_region(struct dfl_fme_region *fme_region)
+{
+	platform_device_unregister(fme_region->region);
+}
+
+/**
+ * dfl_fme_destroy_regions - destroy all fme regions
+ * @pdata: fme platform device's pdata
+ */
+static void dfl_fme_destroy_regions(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_fme *priv = dfl_fpga_pdata_get_private(pdata);
+	struct dfl_fme_region *fme_region, *tmp;
+
+	list_for_each_entry_safe(fme_region, tmp, &priv->region_list, node) {
+		list_del(&fme_region->node);
+		dfl_fme_destroy_region(fme_region);
+	}
+}
+
+static int pr_mgmt_init(struct platform_device *pdev,
+			struct dfl_feature *feature)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_fme_region *fme_region;
+	struct dfl_fme_bridge *fme_br;
+	struct platform_device *mgr;
+	struct dfl_fme *priv;
+	void __iomem *fme_hdr;
+	int ret = -ENODEV, i = 0;
+	u64 fme_cap, port_offset;
+
+	fme_hdr = dfl_get_feature_ioaddr_by_id(&pdev->dev,
+					       FME_FEATURE_ID_HEADER);
+
+	mutex_lock(&pdata->lock);
+	priv = dfl_fpga_pdata_get_private(pdata);
+
+	/* Initialize the region and bridge sub device list */
+	INIT_LIST_HEAD(&priv->region_list);
+	INIT_LIST_HEAD(&priv->bridge_list);
+
+	/* Create fpga mgr platform device (any failure reports -ENODEV) */
+	mgr = dfl_fme_create_mgr(pdata, feature);
+	if (IS_ERR(mgr)) {
+		dev_err(&pdev->dev, "fail to create fpga mgr pdev\n");
+		goto unlock;
+	}
+
+	priv->mgr = mgr;
+
+	/* Read capability register to check number of regions and bridges */
+	fme_cap = readq(fme_hdr + FME_HDR_CAP);
+	for (; i < FIELD_GET(FME_CAP_NUM_PORTS, fme_cap); i++) {
+		port_offset = readq(fme_hdr + FME_HDR_PORT_OFST(i));
+		if (!(port_offset & FME_PORT_OFST_IMP))
+			continue;
+
+		/* Create bridge for each port */
+		fme_br = dfl_fme_create_bridge(pdata, i);
+		if (IS_ERR(fme_br)) {
+			ret = PTR_ERR(fme_br);
+			goto destroy_region;
+		}
+
+		list_add(&fme_br->node, &priv->bridge_list);
+
+		/* Create region for each port (ERR_PTR() on failure) */
+		fme_region = dfl_fme_create_region(pdata, mgr,
+						   fme_br->br, i);
+		if (IS_ERR(fme_region)) {
+			ret = PTR_ERR(fme_region);
+			goto destroy_region;
+		}
+
+		list_add(&fme_region->node, &priv->region_list);
+	}
+	mutex_unlock(&pdata->lock);
+
+	return 0;
+
+destroy_region:
+	dfl_fme_destroy_regions(pdata);
+	dfl_fme_destroy_bridges(pdata);
+	dfl_fme_destroy_mgr(pdata);
+unlock:
+	mutex_unlock(&pdata->lock);
+	return ret;
+}
+
+static void pr_mgmt_uinit(struct platform_device *pdev,
+			  struct dfl_feature *feature)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_fme *priv;
+
+	mutex_lock(&pdata->lock);
+	priv = dfl_fpga_pdata_get_private(pdata);
+
+	dfl_fme_destroy_regions(pdata);
+	dfl_fme_destroy_bridges(pdata);
+	dfl_fme_destroy_mgr(pdata);
+	mutex_unlock(&pdata->lock);
+}
+
+static long fme_pr_ioctl(struct platform_device *pdev,
+			 struct dfl_feature *feature,
+			 unsigned int cmd, unsigned long arg)
+{
+	long ret;
+
+	switch (cmd) {
+	case DFL_FPGA_FME_PORT_PR:
+		ret = fme_pr(pdev, arg);
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+	return ret;
+}
+
+const struct dfl_feature_ops pr_mgmt_ops = {
+	.init = pr_mgmt_init,
+	.uinit = pr_mgmt_uinit,
+	.ioctl = fme_pr_ioctl,
+};
diff --git a/drivers/fpga/dfl-fme-pr.h b/drivers/fpga/dfl-fme-pr.h
new file mode 100644
index 000000000000..096a699089d3
--- /dev/null
+++ b/drivers/fpga/dfl-fme-pr.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for FPGA Management Engine (FME) Partial Reconfiguration Driver
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Joseph Grecco <joe.grecco@intel.com>
+ *   Enno Luebbers <enno.luebbers@intel.com>
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ *   Henry Mitchel <henry.mitchel@intel.com>
+ */
+
+#ifndef __DFL_FME_PR_H
+#define __DFL_FME_PR_H
+
+#include <linux/platform_device.h>
+
+/**
+ * struct dfl_fme_region - FME fpga region data structure
+ *
+ * @region: platform device of the FPGA region.
+ * @node: list node, linked into the region_list of struct dfl_fme.
+ * @port_id: id of the port this region is connected to.
+ */
+struct dfl_fme_region {
+	struct platform_device *region;
+	struct list_head node;
+	int port_id;
+};
+
+/**
+ * struct dfl_fme_region_pdata - platform data for FME region platform device.
+ *
+ * @mgr: platform device of the FPGA manager associated with this region.
+ * @br: platform device of the FPGA bridge associated with this region.
+ * @region_id: region id (same as port_id).
+ */
+struct dfl_fme_region_pdata {
+	struct platform_device *mgr;
+	struct platform_device *br;
+	int region_id;
+};
+
+/**
+ * struct dfl_fme_bridge - FME fpga bridge data structure
+ *
+ * @br: platform device of the FPGA bridge.
+ * @node: list node, linked into the bridge_list of struct dfl_fme.
+ */
+struct dfl_fme_bridge {
+	struct platform_device *br;
+	struct list_head node;
+};
+
+/**
+ * struct dfl_fme_br_pdata - platform data for FME bridge platform device.
+ *
+ * @cdev: container device.
+ * @port_id: id of the port this bridge was created for.
+ */
+struct dfl_fme_br_pdata {
+	struct dfl_fpga_cdev *cdev;
+	int port_id;
+};
+
+/**
+ * struct dfl_fme_mgr_pdata - platform data for FME manager platform device.
+ *
+ * @ioaddr: mapped io address (__iomem) for FME manager platform device.
+ */
+struct dfl_fme_mgr_pdata {
+	void __iomem *ioaddr;
+};
+
+#define DFL_FPGA_FME_MGR	"dfl-fme-mgr"
+#define DFL_FPGA_FME_BRIDGE	"dfl-fme-bridge"
+#define DFL_FPGA_FME_REGION	"dfl-fme-region"
+
+#endif /* __DFL_FME_PR_H */
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
new file mode 100644
index 000000000000..5394a216c5c0
--- /dev/null
+++ b/drivers/fpga/dfl-fme.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for FPGA Management Engine (FME) Driver
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Joseph Grecco <joe.grecco@intel.com>
+ *   Enno Luebbers <enno.luebbers@intel.com>
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ *   Henry Mitchel <henry.mitchel@intel.com>
+ */
+
+#ifndef __DFL_FME_H
+#define __DFL_FME_H
+
+/**
+ * struct dfl_fme - dfl fme private data
+ *
+ * @mgr: FME's FPGA manager platform device.
+ * @region_list: list of FME's FPGA regions (struct dfl_fme_region).
+ * @bridge_list: list of FME's FPGA bridges (struct dfl_fme_bridge).
+ * @pdata: fme platform device's pdata.
+ */
+struct dfl_fme {
+	struct platform_device *mgr;
+	struct list_head region_list;
+	struct list_head bridge_list;
+	struct dfl_feature_platform_data *pdata;
+};
+
+extern const struct dfl_feature_ops pr_mgmt_ops;
+
+#endif /* __DFL_FME_H */
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
index 858e4437c31c..9666af85a8f5 100644
--- a/include/uapi/linux/fpga-dfl.h
+++ b/include/uapi/linux/fpga-dfl.h
@@ -14,6 +14,7 @@
 #ifndef _UAPI_LINUX_FPGA_DFL_H
 #define _UAPI_LINUX_FPGA_DFL_H
 
+#include <linux/types.h>
 #include <linux/ioctl.h>
 
 #define DFL_FPGA_API_VERSION 0
@@ -28,6 +29,7 @@
 #define DFL_FPGA_MAGIC 0xB6
 
 #define DFL_FPGA_BASE 0
+#define DFL_FME_BASE 0x80
 
 /**
  * DFL_FPGA_GET_API_VERSION - _IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 0)
@@ -47,4 +49,29 @@
 
 #define DFL_FPGA_CHECK_EXTENSION	_IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 1)
 
+/* IOCTLs for FME file descriptor */
+
+/**
+ * DFL_FPGA_FME_PORT_PR - _IOW(DFL_FPGA_MAGIC, DFL_FME_BASE + 0,
+ *						struct dfl_fpga_fme_port_pr)
+ *
+ * Driver does Partial Reconfiguration based on Port ID and Buffer (Image)
+ * provided by caller.
+ * Return: 0 on success, -errno on failure.
+ * If DFL_FPGA_FME_PORT_PR returns -EIO, that indicates the HW has detected
+ * some errors during PR, under this case, the user can fetch HW error info
+ * from the status of FME's fpga manager.
+ */
+
+struct dfl_fpga_fme_port_pr {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	__u32 flags;		/* Zero for now */
+	__u32 port_id;		/* Port to reconfigure */
+	__u32 buffer_size;	/* Length of the PR buffer (image) */
+	__u64 buffer_address;	/* Userspace address to the buffer for PR */
+};
+
+#define DFL_FPGA_FME_PORT_PR	_IO(DFL_FPGA_MAGIC, DFL_FME_BASE + 0)
+
 #endif /* _UAPI_LINUX_FPGA_DFL_H */
-- 
cgit v1.2.3


From e4664c0ee4ac44993c62d10b048ab0a960691da5 Mon Sep 17 00:00:00 2001
From: Wu Hao <hao.wu@intel.com>
Date: Sat, 30 Jun 2018 08:53:32 +0800
Subject: fpga: dfl: afu: add header sub feature support

The port header register set is always present for a port; it mainly
covers capability, control and status of the port the AFU is connected to.

This patch implements header sub feature support. Below user interfaces
are created by this patch.

Sysfs interface:
* /sys/class/fpga_region/<regionX>/<dfl-port.x>/id
  Read-only. Port ID.

Ioctl interface:
* DFL_FPGA_PORT_RESET
  Reset the FPGA Port and its AFU.

Signed-off-by: Tim Whisonant <tim.whisonant@intel.com>
Signed-off-by: Enno Luebbers <enno.luebbers@intel.com>
Signed-off-by: Shiva Rao <shiva.rao@intel.com>
Signed-off-by: Christopher Rauer <christopher.rauer@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-platform-dfl-port |  7 ++
 drivers/fpga/dfl-afu-main.c                       | 79 ++++++++++++++++++++++-
 include/uapi/linux/fpga-dfl.h                     | 17 +++++
 3 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/testing/sysfs-platform-dfl-port

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
new file mode 100644
index 000000000000..cb91165f5397
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
@@ -0,0 +1,7 @@
+What:		/sys/bus/platform/devices/dfl-port.0/id
+Date:		June 2018
+KernelVersion:	4.19
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns id of this port. One DFL FPGA device
+		may have more than one port. Userspace could use this id to
+		distinguish different ports under same FPGA device.
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index a38d6a825e7e..d36b3e9f3984 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -16,6 +16,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/fpga-dfl.h>
 
 #include "dfl.h"
 
@@ -87,6 +88,41 @@ static int port_disable(struct platform_device *pdev)
 	return 0;
 }
 
+/*
+ * Reset the FPGA Port and its accelerator (AFU) by calling port_disable()
+ * followed by port_enable(), i.e. set the port soft reset bit and then
+ * clear it again. Userspace may request a Port reset at any time, e.g.
+ * during DMA or Partial Reconfiguration. That must never cause a system
+ * level issue, only a functional failure (e.g. a failed DMA or PR
+ * operation) which can be recovered from.
+ *
+ * Note: the accelerator (AFU) is not accessible while its port is in reset
+ * (disabled). Any MMIO access to the AFU while in reset results in errors
+ * reported via the port error reporting sub feature (if present).
+ */
+static int __port_reset(struct platform_device *pdev)
+{
+	int err = port_disable(pdev);
+
+	if (err)
+		return err;
+
+	port_enable(pdev);
+	return 0;
+}
+
+/* Locked variant of __port_reset(): holds pdata->lock across the reset. */
+static int port_reset(struct platform_device *pdev)
+{
+	struct dfl_feature_platform_data *pd = dev_get_platdata(&pdev->dev);
+	int status;
+
+	mutex_lock(&pd->lock);
+	status = __port_reset(pdev);
+	mutex_unlock(&pd->lock);
+	return status;
+}
+
 static int port_get_id(struct platform_device *pdev)
 {
 	void __iomem *base;
@@ -96,23 +132,63 @@ static int port_get_id(struct platform_device *pdev)
 	return FIELD_GET(PORT_CAP_PORT_NUM, readq(base + PORT_HDR_CAP));
 }
 
+static ssize_t
+id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* Emit the port id as a decimal number plus trailing newline. */
+	return scnprintf(buf, PAGE_SIZE, "%d\n",
+			 port_get_id(to_platform_device(dev)));
+}
+static DEVICE_ATTR_RO(id);
+
+static const struct attribute *port_hdr_attrs[] = {	/* NULL-terminated */
+	&dev_attr_id.attr,	/* the "id" file, shown by id_show() */
+	NULL,
+};
+
 static int port_hdr_init(struct platform_device *pdev,
 			 struct dfl_feature *feature)
 {
 	dev_dbg(&pdev->dev, "PORT HDR Init.\n");
 
-	return 0;
+	port_reset(pdev);
+
+	return sysfs_create_files(&pdev->dev.kobj, port_hdr_attrs);
 }
 
 static void port_hdr_uinit(struct platform_device *pdev,
 			   struct dfl_feature *feature)
 {
 	dev_dbg(&pdev->dev, "PORT HDR UInit.\n");
+
+	sysfs_remove_files(&pdev->dev.kobj, port_hdr_attrs);
+}
+
+static long
+port_hdr_ioctl(struct platform_device *pdev, struct dfl_feature *feature,
+	       unsigned int cmd, unsigned long arg)
+{
+	long ret;
+	/* Port header ioctl hook: only DFL_FPGA_PORT_RESET is served here. */
+	switch (cmd) {
+	case DFL_FPGA_PORT_RESET:
+		if (!arg)	/* the reset ioctl takes no parameters */
+			ret = port_reset(pdev);
+		else
+			ret = -EINVAL;
+		break;
+	default:
+		dev_dbg(&pdev->dev, "%x cmd not handled\n", cmd);
+		ret = -ENODEV;	/* not one of the port header's commands */
+	}
+
+	return ret;
+}
 
 static const struct dfl_feature_ops port_hdr_ops = {
 	.init = port_hdr_init,
 	.uinit = port_hdr_uinit,
+	.ioctl = port_hdr_ioctl,
 };
 
 static struct dfl_feature_driver port_feature_drvs[] = {
@@ -154,6 +230,7 @@ static int afu_release(struct inode *inode, struct file *filp)
 
 	pdata = dev_get_platdata(&pdev->dev);
 
+	port_reset(pdev);
 	dfl_feature_dev_use_end(pdata);
 
 	return 0;
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
index 9666af85a8f5..e6b4dd26cc68 100644
--- a/include/uapi/linux/fpga-dfl.h
+++ b/include/uapi/linux/fpga-dfl.h
@@ -29,8 +29,11 @@
 #define DFL_FPGA_MAGIC 0xB6
 
 #define DFL_FPGA_BASE 0
+#define DFL_PORT_BASE 0x40
 #define DFL_FME_BASE 0x80
 
+/* Common IOCTLs for both FME and AFU file descriptor */
+
 /**
  * DFL_FPGA_GET_API_VERSION - _IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 0)
  *
@@ -49,6 +52,20 @@
 
 #define DFL_FPGA_CHECK_EXTENSION	_IO(DFL_FPGA_MAGIC, DFL_FPGA_BASE + 1)
 
+/* IOCTLs for AFU file descriptor */
+
+/**
+ * DFL_FPGA_PORT_RESET - _IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 0)
+ *
+ * Reset the FPGA Port and its AFU. No parameters are supported.
+ * Userspace can do Port reset at any time, e.g. during DMA or PR. But
+ * it should never cause any system level issue, only functional failure
+ * (e.g. DMA or PR operation failure) and be recoverable from the failure.
+ * Return: 0 on success, -errno on failure.
+ */
+
+#define DFL_FPGA_PORT_RESET		_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 0)
+
 /* IOCTLs for FME file descriptor */
 
 /**
-- 
cgit v1.2.3


From 857a26222ff75eecf7d701ef0e91e4fbf6efa663 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Date: Sat, 30 Jun 2018 08:53:34 +0800
Subject: fpga: dfl: afu: add afu sub feature support

User Accelerated Function Unit sub feature exposes the MMIO region of
the AFU. After valid PR bitstream is programmed and the port is enabled,
then this MMIO region could be accessed.

This patch adds support to enumerate the AFU MMIO region and expose it
to userspace via mmap file operation. Below interfaces are exposed to user:

Sysfs interface:
* /sys/class/fpga_region/<regionX>/<dfl-port.x>/afu_id
  Read-only. Indicate which PR bitstream is programmed to this AFU.

Ioctl interfaces:
* DFL_FPGA_PORT_GET_INFO
  Provide info to userspace on the number of supported regions.
  Only UAFU region is supported now.

* DFL_FPGA_PORT_GET_REGION_INFO
  Provide region information, including access permission, region size,
  offset from the start of device fd.

Signed-off-by: Tim Whisonant <tim.whisonant@intel.com>
Signed-off-by: Enno Luebbers <enno.luebbers@intel.com>
Signed-off-by: Shiva Rao <shiva.rao@intel.com>
Signed-off-by: Christopher Rauer <christopher.rauer@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-platform-dfl-port |   9 +
 drivers/fpga/Makefile                             |   2 +-
 drivers/fpga/dfl-afu-main.c                       | 219 +++++++++++++++++++++-
 drivers/fpga/dfl-afu-region.c                     | 166 ++++++++++++++++
 drivers/fpga/dfl-afu.h                            |  71 +++++++
 include/uapi/linux/fpga-dfl.h                     |  48 +++++
 6 files changed, 508 insertions(+), 7 deletions(-)
 create mode 100644 drivers/fpga/dfl-afu-region.c
 create mode 100644 drivers/fpga/dfl-afu.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-port b/Documentation/ABI/testing/sysfs-platform-dfl-port
index cb91165f5397..6a92dda517b0 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-port
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-port
@@ -5,3 +5,12 @@ Contact:	Wu Hao <hao.wu@intel.com>
 Description:	Read-only. It returns id of this port. One DFL FPGA device
 		may have more than one port. Userspace could use this id to
 		distinguish different ports under same FPGA device.
+
+What:		/sys/bus/platform/devices/dfl-port.0/afu_id
+Date:		June 2018
+KernelVersion:	4.19
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. User can program different PR bitstreams to FPGA
+		Accelerator Function Unit (AFU) for different functions. It
+		returns uuid which could be used to identify which PR bitstream
+		is programmed in this AFU.
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index 1ac7749b2542..a44d50dd0b70 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_REGION)	+= dfl-fme-region.o
 obj-$(CONFIG_FPGA_DFL_AFU)		+= dfl-afu.o
 
 dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
-dfl-afu-objs := dfl-afu-main.o
+dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o
 
 # Drivers for FPGAs which implement DFL
 obj-$(CONFIG_FPGA_DFL_PCI)		+= dfl-pci.o
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index 4074b97122e2..f67a78d7e9ad 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -16,18 +16,18 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <linux/fpga-dfl.h>
 
-#include "dfl.h"
+#include "dfl-afu.h"
 
 /**
  * port_enable - enable a port
  * @pdev: port platform device.
  *
  * Enable Port by clear the port soft reset bit, which is set by default.
- * The User AFU is unable to respond to any MMIO access while in reset.
- * port_enable function should only be used after port_disable
- * function.
+ * The AFU is unable to respond to any MMIO access while in reset.
+ * port_enable function should only be used after port_disable function.
  */
 static void port_enable(struct platform_device *pdev)
 {
@@ -191,11 +191,74 @@ static const struct dfl_feature_ops port_hdr_ops = {
 	.ioctl = port_hdr_ioctl,
 };
 
+static ssize_t
+afu_id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev);
+	void __iomem *afu_base;
+	u64 lo = 0, hi = 0;
+	bool disabled;
+
+	afu_base = dfl_get_feature_ioaddr_by_id(dev, PORT_FEATURE_ID_AFU);
+
+	mutex_lock(&pdata->lock);
+	disabled = pdata->disable_count != 0;
+	if (!disabled) {	/* GUID is only read from an enabled port */
+		lo = readq(afu_base + GUID_L);
+		hi = readq(afu_base + GUID_H);
+	}
+	mutex_unlock(&pdata->lock);
+	if (disabled)
+		return -EBUSY;
+	return scnprintf(buf, PAGE_SIZE, "%016llx%016llx\n", hi, lo);
+}
+static DEVICE_ATTR_RO(afu_id);
+
+static const struct attribute *port_afu_attrs[] = {	/* NULL-terminated */
+	&dev_attr_afu_id.attr,	/* the "afu_id" file, shown by afu_id_show() */
+	NULL
+};
+
+static int port_afu_init(struct platform_device *pdev,
+			 struct dfl_feature *feature)
+{
+	const u32 access = DFL_PORT_REGION_READ | DFL_PORT_REGION_WRITE |
+			   DFL_PORT_REGION_MMAP;
+	struct resource *mmio = &pdev->resource[feature->resource_index];
+	int err;
+
+	dev_dbg(&pdev->dev, "PORT AFU Init.\n");
+	/* Register the feature's MMIO resource as the AFU region first. */
+	err = afu_mmio_region_add(dev_get_platdata(&pdev->dev),
+				  DFL_PORT_REGION_INDEX_AFU,
+				  resource_size(mmio), mmio->start, access);
+	if (err)
+		return err;
+	return sysfs_create_files(&pdev->dev.kobj, port_afu_attrs);
+}
+
+static void port_afu_uinit(struct platform_device *pdev,
+			   struct dfl_feature *feature)
+{
+	dev_dbg(&pdev->dev, "PORT AFU UInit.\n");
+	/* Only the sysfs files are removed here, not the mmio region. */
+	sysfs_remove_files(&pdev->dev.kobj, port_afu_attrs);
+}
+
+static const struct dfl_feature_ops port_afu_ops = {
+	.init = port_afu_init,		/* adds the AFU mmio region + afu_id */
+	.uinit = port_afu_uinit,	/* removes the afu_id sysfs file */
+};
+
 static struct dfl_feature_driver port_feature_drvs[] = {
 	{
 		.id = PORT_FEATURE_ID_HEADER,
 		.ops = &port_hdr_ops,
 	},
+	{
+		.id = PORT_FEATURE_ID_AFU,
+		.ops = &port_afu_ops,
+	},
 	{
 		.ops = NULL,
 	}
@@ -243,6 +306,64 @@ static long afu_ioctl_check_extension(struct dfl_feature_platform_data *pdata,
 	return 0;
 }
 
+/*
+ * DFL_FPGA_PORT_GET_INFO handler: fills in flags, num_regions, num_umsgs.
+ */
+static long
+afu_ioctl_get_info(struct dfl_feature_platform_data *pdata, void __user *arg)
+{
+	const unsigned long minsz =
+		offsetofend(struct dfl_fpga_port_info, num_umsgs);
+	struct dfl_fpga_port_info uinfo;
+	struct dfl_afu *afu;
+
+	/* Fetch the caller's struct first; only argsz is treated as input. */
+	if (copy_from_user(&uinfo, arg, minsz))
+		return -EFAULT;
+	if (uinfo.argsz < minsz)
+		return -EINVAL;
+
+	/* Snapshot the counters under pdata->lock. */
+	mutex_lock(&pdata->lock);
+	afu = dfl_fpga_pdata_get_private(pdata);
+	uinfo.num_umsgs = afu->num_umsgs;
+	uinfo.num_regions = afu->num_regions;
+	uinfo.flags = 0;
+	mutex_unlock(&pdata->lock);
+
+	return copy_to_user(arg, &uinfo, sizeof(uinfo)) ? -EFAULT : 0;
+}
+
+static long afu_ioctl_get_region_info(struct dfl_feature_platform_data *pdata,
+				      void __user *arg)
+{
+	const unsigned long minsz =
+		offsetofend(struct dfl_fpga_port_region_info, offset);
+	struct dfl_fpga_port_region_info uinfo;
+	struct dfl_afu_mmio_region found;
+	long err;
+
+	if (copy_from_user(&uinfo, arg, minsz))
+		return -EFAULT;
+	/* Too-short structs and a non-zero padding field are both rejected. */
+	if (uinfo.padding || uinfo.argsz < minsz)
+		return -EINVAL;
+
+	/* The lookup hands back a copy of the region, made under the lock. */
+	err = afu_mmio_region_get_by_index(pdata, uinfo.index, &found);
+	if (err)
+		return err;
+
+	uinfo.offset = found.offset;
+	uinfo.size = found.size;
+	uinfo.flags = found.flags;
+
+	if (copy_to_user(arg, &uinfo, sizeof(uinfo)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long afu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct platform_device *pdev = filp->private_data;
@@ -259,6 +380,10 @@ static long afu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return DFL_FPGA_API_VERSION;
 	case DFL_FPGA_CHECK_EXTENSION:
 		return afu_ioctl_check_extension(pdata, arg);
+	case DFL_FPGA_PORT_GET_INFO:
+		return afu_ioctl_get_info(pdata, (void __user *)arg);
+	case DFL_FPGA_PORT_GET_REGION_INFO:
+		return afu_ioctl_get_region_info(pdata, (void __user *)arg);
 	default:
 		/*
 		 * Let sub-feature's ioctl function to handle the cmd
@@ -277,13 +402,83 @@ static long afu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
+static int afu_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct platform_device *pdev = filp->private_data;
+	struct dfl_feature_platform_data *pdata;
+	u64 size = vma->vm_end - vma->vm_start;
+	struct dfl_afu_mmio_region region;
+	u64 offset;
+	int ret;
+
+	if (!(vma->vm_flags & VM_SHARED))	/* shared mappings only */
+		return -EINVAL;
+
+	pdata = dev_get_platdata(&pdev->dev);
+	/* The byte offset into the device fd selects the mmio region. */
+	offset = vma->vm_pgoff << PAGE_SHIFT;
+	ret = afu_mmio_region_get_by_offset(pdata, offset, size, &region);
+	if (ret)
+		return ret;
+	/* The region must permit mmap and every access the vma asks for. */
+	if (!(region.flags & DFL_PORT_REGION_MMAP))
+		return -EINVAL;
+
+	if ((vma->vm_flags & VM_READ) && !(region.flags & DFL_PORT_REGION_READ))
+		return -EPERM;
+
+	if ((vma->vm_flags & VM_WRITE) &&
+	    !(region.flags & DFL_PORT_REGION_WRITE))
+		return -EPERM;
+	/* Map with a non-cached page protection. */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	/* Start at the matching position inside the region. */
+	return remap_pfn_range(vma, vma->vm_start,
+			(region.phys + (offset - region.offset)) >> PAGE_SHIFT,
+			size, vma->vm_page_prot);
+}
+
 static const struct file_operations afu_fops = {
 	.owner = THIS_MODULE,
 	.open = afu_open,
 	.release = afu_release,
 	.unlocked_ioctl = afu_ioctl,
+	.mmap = afu_mmap,
 };
 
+static int afu_dev_init(struct platform_device *pdev)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_afu *afu;
+
+	afu = devm_kzalloc(&pdev->dev, sizeof(*afu), GFP_KERNEL);
+	if (!afu)
+		return -ENOMEM;
+	/* Zeroed, device-managed private data with a back-pointer to pdata. */
+	afu->pdata = pdata;
+	/* Set the private data first: afu_mmio_region_init() looks it up. */
+	mutex_lock(&pdata->lock);
+	dfl_fpga_pdata_set_private(pdata, afu);
+	afu_mmio_region_init(pdata);
+	mutex_unlock(&pdata->lock);
+
+	return 0;
+}
+
+static int afu_dev_destroy(struct platform_device *pdev)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct dfl_afu *afu;
+	/* NOTE(review): afu is assigned below but never read in this function. */
+	mutex_lock(&pdata->lock);
+	afu = dfl_fpga_pdata_get_private(pdata);
+	afu_mmio_region_destroy(pdata);
+	dfl_fpga_pdata_set_private(pdata, NULL);
+	mutex_unlock(&pdata->lock);
+
+	return 0;
+}
+
 static int port_enable_set(struct platform_device *pdev, bool enable)
 {
 	struct dfl_feature_platform_data *pdata = dev_get_platdata(&pdev->dev);
@@ -312,14 +507,25 @@ static int afu_probe(struct platform_device *pdev)
 
 	dev_dbg(&pdev->dev, "%s\n", __func__);
 
+	ret = afu_dev_init(pdev);
+	if (ret)
+		goto exit;
+
 	ret = dfl_fpga_dev_feature_init(pdev, port_feature_drvs);
 	if (ret)
-		return ret;
+		goto dev_destroy;
 
 	ret = dfl_fpga_dev_ops_register(pdev, &afu_fops, THIS_MODULE);
-	if (ret)
+	if (ret) {
 		dfl_fpga_dev_feature_uinit(pdev);
+		goto dev_destroy;
+	}
+
+	return 0;
 
+dev_destroy:
+	afu_dev_destroy(pdev);
+exit:
 	return ret;
 }
 
@@ -329,6 +535,7 @@ static int afu_remove(struct platform_device *pdev)
 
 	dfl_fpga_dev_ops_unregister(pdev);
 	dfl_fpga_dev_feature_uinit(pdev);
+	afu_dev_destroy(pdev);
 
 	return 0;
 }
diff --git a/drivers/fpga/dfl-afu-region.c b/drivers/fpga/dfl-afu-region.c
new file mode 100644
index 000000000000..0804b7a0c298
--- /dev/null
+++ b/drivers/fpga/dfl-afu-region.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Accelerated Function Unit (AFU) MMIO Region Management
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Wu Hao <hao.wu@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#include "dfl-afu.h"
+
+/**
+ * afu_mmio_region_init - init the (empty) mmio region list of an afu
+ * @pdata: afu platform device's pdata; its private data must already be set.
+ */
+void afu_mmio_region_init(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_afu *afu = dfl_fpga_pdata_get_private(pdata);
+
+	INIT_LIST_HEAD(&afu->regions);
+}
+
+#define for_each_region(region, afu)	\
+	list_for_each_entry((region), &(afu)->regions, node)
+
+static struct dfl_afu_mmio_region *get_region_by_index(struct dfl_afu *afu,
+						       u32 region_index)
+{
+	struct dfl_afu_mmio_region *cur;
+
+	for_each_region(cur, afu) {	/* first match wins */
+		if (cur->index == region_index)
+			return cur;
+	}
+	return NULL;
+}
+
+/**
+ * afu_mmio_region_add - add a mmio region to given feature dev.
+ *
+ * @pdata: afu platform device's pdata.
+ * @region_index: region index, must not be in use yet.
+ * @region_size: region size.
+ * @phys: physical address of the region.
+ * @flags: region flags (access permission).
+ *
+ * The region is placed at the current end of the device fd offset space.
+ *
+ * Return: 0 on success, -ENOMEM or -EEXIST otherwise.
+ */
+int afu_mmio_region_add(struct dfl_feature_platform_data *pdata,
+			u32 region_index, u64 region_size, u64 phys, u32 flags)
+{
+	struct device *dev = &pdata->dev->dev;
+	struct dfl_afu_mmio_region *new;
+	struct dfl_afu *afu;
+	bool dup;
+
+	new = devm_kzalloc(dev, sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* Fill in everything that does not depend on shared afu state. */
+	new->flags = flags;
+	new->phys = phys;
+	new->size = region_size;
+	new->index = region_index;
+
+	mutex_lock(&pdata->lock);
+	afu = dfl_fpga_pdata_get_private(pdata);
+
+	/* Region indexes are unique; link the node only if unused. */
+	dup = get_region_by_index(afu, region_index) != NULL;
+	if (!dup) {
+		/* Placed back to back, each starting page aligned. */
+		new->offset = afu->region_cur_offset;
+		afu->region_cur_offset += PAGE_ALIGN(region_size);
+		afu->num_regions++;
+		list_add(&new->node, &afu->regions);
+	}
+	mutex_unlock(&pdata->lock);
+
+	if (!dup)
+		return 0;
+
+	devm_kfree(dev, new);
+	return -EEXIST;
+}
+
+/**
+ * afu_mmio_region_destroy - free all mmio regions under given feature dev.
+ * @pdata: afu platform device's pdata.
+ */
+void afu_mmio_region_destroy(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_afu *afu = dfl_fpga_pdata_get_private(pdata);
+	struct dfl_afu_mmio_region *tmp, *region;
+	/* No list_del() here: afu->regions is left stale after the loop. */
+	list_for_each_entry_safe(region, tmp, &afu->regions, node)
+		devm_kfree(&pdata->dev->dev, region);
+}
+
+/**
+ * afu_mmio_region_get_by_index - find an afu region by index.
+ * @pdata: afu platform device's pdata.
+ * @region_index: region index.
+ * @pregion: ptr to region for result, receives a copy of the region.
+ *
+ * The lookup and the copy are both done with pdata->lock held.
+ * Return: 0 on success, -EINVAL if no region has @region_index.
+ */
+int afu_mmio_region_get_by_index(struct dfl_feature_platform_data *pdata,
+				 u32 region_index,
+				 struct dfl_afu_mmio_region *pregion)
+{
+	struct dfl_afu_mmio_region *hit;
+	int ret = -EINVAL;
+
+	mutex_lock(&pdata->lock);
+	hit = get_region_by_index(dfl_fpga_pdata_get_private(pdata),
+				  region_index);
+	if (hit) {
+		/* Hand back a copy, not a pointer into the list. */
+		*pregion = *hit;
+		ret = 0;
+	}
+	mutex_unlock(&pdata->lock);
+
+	return ret;
+}
+
+/**
+ * afu_mmio_region_get_by_offset - find an afu mmio region by offset and size
+ *
+ * @pdata: afu platform device's pdata.
+ * @offset: start of the range, as an offset from start of the device fd.
+ * @size: size of the range.
+ * @pregion: ptr to region for result, receives a copy of the region.
+ *
+ * Walk the feature dev's region list and pick the first region that fully
+ * contains the range described by @offset and @size.
+ *
+ * Return: 0 on success, -EINVAL if no region contains the range.
+ */
+int afu_mmio_region_get_by_offset(struct dfl_feature_platform_data *pdata,
+				  u64 offset, u64 size,
+				  struct dfl_afu_mmio_region *pregion)
+{
+	struct dfl_afu_mmio_region *cur;
+	struct dfl_afu *afu;
+	int ret = -EINVAL;
+
+	mutex_lock(&pdata->lock);
+	afu = dfl_fpga_pdata_get_private(pdata);
+	for_each_region(cur, afu) {
+		if (offset < cur->offset ||
+		    offset + size > cur->offset + cur->size)
+			continue;
+		*pregion = *cur;
+		ret = 0;
+		break;
+	}
+	mutex_unlock(&pdata->lock);
+	return ret;
+}
diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
new file mode 100644
index 000000000000..11ce2cf99759
--- /dev/null
+++ b/drivers/fpga/dfl-afu.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for FPGA Accelerated Function Unit (AFU) Driver
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *     Wu Hao <hao.wu@intel.com>
+ *     Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *     Joseph Grecco <joe.grecco@intel.com>
+ *     Enno Luebbers <enno.luebbers@intel.com>
+ *     Tim Whisonant <tim.whisonant@intel.com>
+ *     Ananda Ravuri <ananda.ravuri@intel.com>
+ *     Henry Mitchel <henry.mitchel@intel.com>
+ */
+
+#ifndef __DFL_AFU_H
+#define __DFL_AFU_H
+
+#include <linux/mm.h>
+
+#include "dfl.h"
+
+/**
+ * struct dfl_afu_mmio_region - afu mmio region data structure
+ *
+ * @index: region index, unique within one afu.
+ * @flags: region flags (access permission, DFL_PORT_REGION_*).
+ * @size: region size as registered, without page alignment.
+ * @offset: region offset from start of the device fd.
+ * @phys: region's physical address.
+ * @node: node to add to afu feature dev's region list.
+ */
+struct dfl_afu_mmio_region {
+	u32 index;
+	u32 flags;
+	u64 size;
+	u64 offset;
+	u64 phys;
+	struct list_head node;
+};
+
+/**
+ * struct dfl_afu - afu device data structure
+ *
+ * @region_cur_offset: offset from start of the device fd for the next region.
+ * @num_regions: num of mmio regions.
+ * @regions: the mmio region linked list of this afu feature device.
+ * @num_umsgs: num of umsgs.
+ * @pdata: afu platform device's pdata.
+ */
+struct dfl_afu {
+	u64 region_cur_offset;
+	int num_regions;
+	u8 num_umsgs;
+	struct list_head regions;
+
+	struct dfl_feature_platform_data *pdata;
+};
+
+void afu_mmio_region_init(struct dfl_feature_platform_data *pdata);
+int afu_mmio_region_add(struct dfl_feature_platform_data *pdata,
+			u32 region_index, u64 region_size, u64 phys, u32 flags);
+void afu_mmio_region_destroy(struct dfl_feature_platform_data *pdata);
+int afu_mmio_region_get_by_index(struct dfl_feature_platform_data *pdata,
+				 u32 region_index,
+				 struct dfl_afu_mmio_region *pregion);
+int afu_mmio_region_get_by_offset(struct dfl_feature_platform_data *pdata,
+				  u64 offset, u64 size,
+				  struct dfl_afu_mmio_region *pregion);
+#endif
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
index e6b4dd26cc68..a3ccdfb115a5 100644
--- a/include/uapi/linux/fpga-dfl.h
+++ b/include/uapi/linux/fpga-dfl.h
@@ -66,6 +66,54 @@
 
 #define DFL_FPGA_PORT_RESET		_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 0)
 
+/**
+ * DFL_FPGA_PORT_GET_INFO - _IOR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 1,
+ *						struct dfl_fpga_port_info)
+ *
+ * Retrieve information about the fpga port.
+ * Caller sets argsz, driver fills in the other fields of the struct.
+ * Return: 0 on success, -errno on failure (-EINVAL if argsz is too small).
+ */
+struct dfl_fpga_port_info {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	/* Output */
+	__u32 flags;		/* Zero for now */
+	__u32 num_regions;	/* The number of supported regions */
+	__u32 num_umsgs;	/* The number of allocated umsgs */
+};
+
+#define DFL_FPGA_PORT_GET_INFO		_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 1)
+
+/**
+ * DFL_FPGA_PORT_GET_REGION_INFO - _IOWR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 2,
+ *					struct dfl_fpga_port_region_info)
+ *
+ * Retrieve information about a device memory region.
+ * Caller provides struct dfl_fpga_port_region_info with index value set.
+ * Driver returns the region info in other fields.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_port_region_info {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	/* Output */
+	__u32 flags;		/* Access permission */
+#define DFL_PORT_REGION_READ	(1 << 0)	/* Region is readable */
+#define DFL_PORT_REGION_WRITE	(1 << 1)	/* Region is writable */
+#define DFL_PORT_REGION_MMAP	(1 << 2)	/* Can be mmaped to userspace */
+	/* Input */
+	__u32 index;		/* Region index */
+#define DFL_PORT_REGION_INDEX_AFU	0	/* AFU */
+#define DFL_PORT_REGION_INDEX_STP	1	/* Signal Tap */
+	__u32 padding;		/* Must be zero */
+	/* Output */
+	__u64 size;		/* Region size (bytes) */
+	__u64 offset;		/* Region offset from start of device fd */
+};
+
+#define DFL_FPGA_PORT_GET_REGION_INFO	_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 2)
+
 /* IOCTLs for FME file descriptor */
 
 /**
-- 
cgit v1.2.3


From fa8dda1edef9ebc3af467c644c5533ac97171e12 Mon Sep 17 00:00:00 2001
From: Wu Hao <hao.wu@intel.com>
Date: Sat, 30 Jun 2018 08:53:35 +0800
Subject: fpga: dfl: afu: add DFL_FPGA_PORT_DMA_MAP/UNMAP ioctls support

DMA memory regions are required for Accelerated Function Unit (AFU) usage.
These two ioctls allow user space applications to map user memory regions
for dma, and unmap them after use. Iova is returned from driver to user
space application via DFL_FPGA_PORT_DMA_MAP ioctl. Application needs to
unmap it after use, otherwise, driver will unmap them in device file
release operation.

Each AFU has its own rb tree to keep track of its mapped DMA regions.

Ioctl interfaces:
* DFL_FPGA_PORT_DMA_MAP
  Do the dma mapping per user_addr and length provided by user.
  Return iova in provided struct dfl_fpga_port_dma_map.

* DFL_FPGA_PORT_DMA_UNMAP
  Unmap the dma region per iova provided by user.

Signed-off-by: Tim Whisonant <tim.whisonant@intel.com>
Signed-off-by: Enno Luebbers <enno.luebbers@intel.com>
Signed-off-by: Shiva Rao <shiva.rao@intel.com>
Signed-off-by: Christopher Rauer <christopher.rauer@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Acked-by: Alan Tull <atull@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/Makefile             |   2 +-
 drivers/fpga/dfl-afu-dma-region.c | 463 ++++++++++++++++++++++++++++++++++++++
 drivers/fpga/dfl-afu-main.c       |  61 ++++-
 drivers/fpga/dfl-afu.h            |  31 ++-
 include/uapi/linux/fpga-dfl.h     |  37 +++
 5 files changed, 591 insertions(+), 3 deletions(-)
 create mode 100644 drivers/fpga/dfl-afu-dma-region.c

(limited to 'include/uapi/linux')

diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index a44d50dd0b70..7a2d73ba7122 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_REGION)	+= dfl-fme-region.o
 obj-$(CONFIG_FPGA_DFL_AFU)		+= dfl-afu.o
 
 dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
-dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o
+dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
 
 # Drivers for FPGAs which implement DFL
 obj-$(CONFIG_FPGA_DFL_PCI)		+= dfl-pci.o
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
new file mode 100644
index 000000000000..0e81d33af856
--- /dev/null
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Accelerated Function Unit (AFU) DMA Region Management
+ *
+ * Copyright (C) 2017-2018 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Wu Hao <hao.wu@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+
+#include "dfl-afu.h"
+
+static void put_all_pages(struct page **pages, int npages)
+{
+	int i;
+
+	for (i = 0; i < npages; i++)
+		if (pages[i])
+			put_page(pages[i]);
+}
+
+void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_afu *afu = dfl_fpga_pdata_get_private(pdata);
+
+	afu->dma_regions = RB_ROOT;
+}
+
+/**
+ * afu_dma_adjust_locked_vm - adjust locked memory
+ * @dev: port device
+ * @npages: number of pages
+ * @incr: increase or decrease locked memory
+ *
+ * Increase or decrease the locked memory size with npages input.
+ *
+ * Return 0 on success.
+ * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK.
+ */
+static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
+{
+	unsigned long locked, lock_limit;
+	int ret = 0;
+
+	/* the task is exiting. */
+	if (!current->mm)
+		return 0;
+
+	down_write(&current->mm->mmap_sem);
+
+	if (incr) {
+		locked = current->mm->locked_vm + npages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			current->mm->locked_vm += npages;
+	} else {
+		if (WARN_ON_ONCE(npages > current->mm->locked_vm))
+			npages = current->mm->locked_vm;
+		current->mm->locked_vm -= npages;
+	}
+
+	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
+		incr ? '+' : '-', npages << PAGE_SHIFT,
+		current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
+		ret ? "- execeeded" : "");
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+/**
+ * afu_dma_pin_pages - pin pages of given dma memory region
+ * @pdata: feature device platform data
+ * @region: dma memory region to be pinned
+ *
+ * Pin all pages of the region; on failure no page is left pinned.
+ * Return 0 for success or negative error code.
+ */
+static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
+			     struct dfl_afu_dma_region *region)
+{
+	int npages = region->length >> PAGE_SHIFT;
+	struct device *dev = &pdata->dev->dev;
+	int ret, pinned;
+
+	ret = afu_dma_adjust_locked_vm(dev, npages, true);
+	if (ret)
+		return ret;
+
+	region->pages = kcalloc(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!region->pages) {
+		ret = -ENOMEM;
+		goto unlock_vm;
+	}
+
+	pinned = get_user_pages_fast(region->user_addr, npages, 1,
+				     region->pages);
+	if (pinned < 0) {
+		ret = pinned;
+		goto free_pages;	/* nothing was pinned */
+	} else if (pinned != npages) {
+		ret = -EFAULT;
+		goto put_pages;		/* release the partial pin */
+	}
+
+	dev_dbg(dev, "%d pages pinned\n", pinned);
+
+	return 0;
+
+put_pages:
+	put_all_pages(region->pages, pinned);
+free_pages:
+	kfree(region->pages);
+unlock_vm:
+	afu_dma_adjust_locked_vm(dev, npages, false);
+	return ret;
+}
+
+/**
+ * afu_dma_unpin_pages - unpin pages of given dma memory region
+ * @pdata: feature device platform data
+ * @region: dma memory region to be unpinned
+ *
+ * Unpin all the pages of given dfl_afu_dma_region.
+ * Also drops the region's locked-memory accounting; returns nothing.
+ */
+static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
+				struct dfl_afu_dma_region *region)
+{
+	long npages = region->length >> PAGE_SHIFT;
+	struct device *dev = &pdata->dev->dev;
+
+	put_all_pages(region->pages, npages);
+	kfree(region->pages);
+	afu_dma_adjust_locked_vm(dev, npages, false);
+
+	dev_dbg(dev, "%ld pages unpinned\n", npages);
+}
+
+/**
+ * afu_dma_check_continuous_pages - check if pages are continuous
+ * @region: dma memory region
+ *
+ * Return true if pages of given dma memory region have continuous physical
+ * address, otherwise return false.
+ */
+static bool afu_dma_check_continuous_pages(struct dfl_afu_dma_region *region)
+{
+	int npages = region->length >> PAGE_SHIFT;
+	int i;
+
+	for (i = 0; i < npages - 1; i++)
+		if (page_to_pfn(region->pages[i]) + 1 !=
+				page_to_pfn(region->pages[i + 1]))
+			return false;
+
+	return true;
+}
+
+/**
+ * dma_region_check_iova - check if memory area is fully contained in the region
+ * @region: dma memory region
+ * @iova: address of the dma memory area
+ * @size: size of the dma memory area
+ *
+ * Compare the dma memory area defined by @iova and @size with given dma region.
+ * Return true if memory area is fully contained in the region, otherwise false.
+ */
+static bool dma_region_check_iova(struct dfl_afu_dma_region *region,
+				  u64 iova, u64 size)
+{
+	if (!size && region->iova != iova)
+		return false;
+
+	return (region->iova <= iova) &&
+		(region->length + region->iova >= iova + size);
+}
+
+/**
+ * afu_dma_region_add - add given dma region to rbtree
+ * @pdata: feature device platform data
+ * @region: dma region to be added
+ *
+ * Return 0 for success, -EEXIST if dma region has already been added.
+ *
+ * Needs to be called with pdata->lock held.
+ */
+static int afu_dma_region_add(struct dfl_feature_platform_data *pdata,
+			      struct dfl_afu_dma_region *region)
+{
+	struct dfl_afu *afu = dfl_fpga_pdata_get_private(pdata);
+	struct rb_node **new, *parent = NULL;
+
+	dev_dbg(&pdata->dev->dev, "add region (iova = %llx)\n",
+		(unsigned long long)region->iova);
+
+	new = &afu->dma_regions.rb_node;
+
+	while (*new) {
+		struct dfl_afu_dma_region *this;
+
+		this = container_of(*new, struct dfl_afu_dma_region, node);
+
+		parent = *new;
+
+		if (dma_region_check_iova(this, region->iova, region->length))
+			return -EEXIST;
+
+		if (region->iova < this->iova)
+			new = &((*new)->rb_left);
+		else if (region->iova > this->iova)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&region->node, parent, new);
+	rb_insert_color(&region->node, &afu->dma_regions);
+
+	return 0;
+}
+
+/**
+ * afu_dma_region_remove - remove given dma region from rbtree
+ * @pdata: feature device platform data
+ * @region: dma region to be removed
+ *
+ * Needs to be called with pdata->lock held.
+ */
+static void afu_dma_region_remove(struct dfl_feature_platform_data *pdata,
+				  struct dfl_afu_dma_region *region)
+{
+	struct dfl_afu *afu;
+
+	dev_dbg(&pdata->dev->dev, "del region (iova = %llx)\n",
+		(unsigned long long)region->iova);
+
+	afu = dfl_fpga_pdata_get_private(pdata);
+	rb_erase(&region->node, &afu->dma_regions);
+}
+
+/**
+ * afu_dma_region_destroy - destroy all regions in rbtree
+ * @pdata: feature device platform data
+ *
+ * Needs to be called with pdata->lock held.
+ */
+void afu_dma_region_destroy(struct dfl_feature_platform_data *pdata)
+{
+	struct dfl_afu *afu = dfl_fpga_pdata_get_private(pdata);
+	struct rb_node *node = rb_first(&afu->dma_regions);
+	struct dfl_afu_dma_region *region;
+
+	while (node) {
+		region = container_of(node, struct dfl_afu_dma_region, node);
+
+		dev_dbg(&pdata->dev->dev, "del region (iova = %llx)\n",
+			(unsigned long long)region->iova);
+
+		/* step first: rb_next() must not see an erased node */
+		node = rb_next(node);
+		rb_erase(&region->node, &afu->dma_regions);
+
+		if (region->iova)
+			dma_unmap_page(dfl_fpga_pdata_to_parent(pdata),
+				       region->iova, region->length,
+				       DMA_BIDIRECTIONAL);
+
+		if (region->pages)
+			afu_dma_unpin_pages(pdata, region);
+		kfree(region);
+	}
+}
+
+/**
+ * afu_dma_region_find - find the dma region from rbtree based on iova and size
+ * @pdata: feature device platform data
+ * @iova: address of the dma memory area
+ * @size: size of the dma memory area
+ *
+ * It finds the dma region from the rbtree based on @iova and @size:
+ * - if @size == 0, it finds the dma region which starts from @iova
+ * - otherwise, it finds the dma region which fully contains
+ *   [@iova, @iova+size)
+ * If nothing is matched returns NULL.
+ *
+ * Needs to be called with pdata->lock held.
+ */
+struct dfl_afu_dma_region *
+afu_dma_region_find(struct dfl_feature_platform_data *pdata, u64 iova, u64 size)
+{
+	struct dfl_afu *afu = dfl_fpga_pdata_get_private(pdata);
+	struct rb_node *node = afu->dma_regions.rb_node;
+	struct device *dev = &pdata->dev->dev;
+
+	while (node) {
+		struct dfl_afu_dma_region *region;
+
+		region = container_of(node, struct dfl_afu_dma_region, node);
+
+		if (dma_region_check_iova(region, iova, size)) {
+			dev_dbg(dev, "find region (iova = %llx)\n",
+				(unsigned long long)region->iova);
+			return region;
+		}
+
+		if (iova < region->iova)
+			node = node->rb_left;
+		else if (iova > region->iova)
+			node = node->rb_right;
+		else
+			/* the iova region is not fully covered. */
+			break;
+	}
+
+	dev_dbg(dev, "region with iova %llx and size %llx is not found\n",
+		(unsigned long long)iova, (unsigned long long)size);
+
+	return NULL;
+}
+
+/**
+ * afu_dma_region_find_iova - find the dma region from rbtree by iova
+ * @pdata: feature device platform data
+ * @iova: address of the dma region
+ *
+ * Needs to be called with pdata->lock held.
+ */
+static struct dfl_afu_dma_region *
+afu_dma_region_find_iova(struct dfl_feature_platform_data *pdata, u64 iova)
+{
+	return afu_dma_region_find(pdata, iova, 0);
+}
+
+/**
+ * afu_dma_map_region - map memory region for dma
+ * @pdata: feature device platform data
+ * @user_addr: address of the memory region
+ * @length: size of the memory region
+ * @iova: pointer of iova address
+ *
+ * Map memory region defined by @user_addr and @length, and return dma address
+ * of the memory region via @iova.
+ * Return 0 for success, otherwise error code.
+ */
+int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
+		       u64 user_addr, u64 length, u64 *iova)
+{
+	struct dfl_afu_dma_region *region;
+	int ret;
+
+	/*
+	 * Check Inputs, only accept page-aligned user memory region with
+	 * valid length.
+	 */
+	if (!PAGE_ALIGNED(user_addr) || !PAGE_ALIGNED(length) || !length)
+		return -EINVAL;
+
+	/* Check overflow */
+	if (user_addr + length < user_addr)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_WRITE, (void __user *)(unsigned long)user_addr,
+		       length))
+		return -EINVAL;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	region->user_addr = user_addr;
+	region->length = length;
+
+	/* Pin the user memory region */
+	ret = afu_dma_pin_pages(pdata, region);
+	if (ret) {
+		dev_err(&pdata->dev->dev, "failed to pin memory region\n");
+		goto free_region;
+	}
+
+	/* Only accept continuous pages, return error else */
+	if (!afu_dma_check_continuous_pages(region)) {
+		dev_err(&pdata->dev->dev, "pages are not continuous\n");
+		ret = -EINVAL;
+		goto unpin_pages;
+	}
+
+	/* Pages are continuous: map via the parent device and check on it */
+	region->iova = dma_map_page(dfl_fpga_pdata_to_parent(pdata),
+				    region->pages[0], 0,
+				    region->length,
+				    DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(dfl_fpga_pdata_to_parent(pdata), region->iova)) {
+		dev_err(&pdata->dev->dev, "failed to map for dma\n");
+		ret = -EFAULT;
+		goto unpin_pages;
+	}
+
+	*iova = region->iova;
+
+	mutex_lock(&pdata->lock);
+	ret = afu_dma_region_add(pdata, region);
+	mutex_unlock(&pdata->lock);
+	if (ret) {
+		dev_err(&pdata->dev->dev, "failed to add dma region\n");
+		goto unmap_dma;
+	}
+
+	return 0;
+
+unmap_dma:
+	dma_unmap_page(dfl_fpga_pdata_to_parent(pdata),
+		       region->iova, region->length, DMA_BIDIRECTIONAL);
+unpin_pages:
+	afu_dma_unpin_pages(pdata, region);
+free_region:
+	kfree(region);
+	return ret;
+}
+
+/**
+ * afu_dma_unmap_region - unmap dma memory region
+ * @pdata: feature device platform data
+ * @iova: dma address of the region
+ *
+ * Unmap dma memory region based on @iova.
+ * Return 0 for success, otherwise error code.
+ */
+int afu_dma_unmap_region(struct dfl_feature_platform_data *pdata, u64 iova)
+{
+	struct dfl_afu_dma_region *region;
+
+	mutex_lock(&pdata->lock);
+	region = afu_dma_region_find_iova(pdata, iova);
+	if (!region) {
+		mutex_unlock(&pdata->lock);
+		return -EINVAL;
+	}
+
+	if (region->in_use) {
+		mutex_unlock(&pdata->lock);
+		return -EBUSY;
+	}
+
+	afu_dma_region_remove(pdata, region);
+	mutex_unlock(&pdata->lock);
+
+	dma_unmap_page(dfl_fpga_pdata_to_parent(pdata),
+		       region->iova, region->length, DMA_BIDIRECTIONAL);
+	afu_dma_unpin_pages(pdata, region);
+	kfree(region);
+
+	return 0;
+}
diff --git a/drivers/fpga/dfl-afu-main.c b/drivers/fpga/dfl-afu-main.c
index f67a78d7e9ad..02baa6a227c0 100644
--- a/drivers/fpga/dfl-afu-main.c
+++ b/drivers/fpga/dfl-afu-main.c
@@ -293,7 +293,11 @@ static int afu_release(struct inode *inode, struct file *filp)
 
 	pdata = dev_get_platdata(&pdev->dev);
 
-	port_reset(pdev);
+	mutex_lock(&pdata->lock);
+	__port_reset(pdev);
+	afu_dma_region_destroy(pdata);
+	mutex_unlock(&pdata->lock);
+
 	dfl_feature_dev_use_end(pdata);
 
 	return 0;
@@ -364,6 +368,55 @@ static long afu_ioctl_get_region_info(struct dfl_feature_platform_data *pdata,
 	return 0;
 }
 
+static long
+afu_ioctl_dma_map(struct dfl_feature_platform_data *pdata, void __user *arg)
+{
+	struct dfl_fpga_port_dma_map map;
+	unsigned long minsz;
+	long ret;
+
+	minsz = offsetofend(struct dfl_fpga_port_dma_map, iova);
+
+	if (copy_from_user(&map, arg, minsz))
+		return -EFAULT;
+
+	if (map.argsz < minsz || map.flags)
+		return -EINVAL;
+
+	ret = afu_dma_map_region(pdata, map.user_addr, map.length, &map.iova);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(arg, &map, sizeof(map))) {
+		afu_dma_unmap_region(pdata, map.iova);
+		return -EFAULT;
+	}
+
+	dev_dbg(&pdata->dev->dev, "dma map: ua=%llx, len=%llx, iova=%llx\n",
+		(unsigned long long)map.user_addr,
+		(unsigned long long)map.length,
+		(unsigned long long)map.iova);
+
+	return 0;
+}
+
+static long
+afu_ioctl_dma_unmap(struct dfl_feature_platform_data *pdata, void __user *arg)
+{
+	struct dfl_fpga_port_dma_unmap unmap;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct dfl_fpga_port_dma_unmap, iova);
+
+	if (copy_from_user(&unmap, arg, minsz))
+		return -EFAULT;
+
+	if (unmap.argsz < minsz || unmap.flags)
+		return -EINVAL;
+
+	return afu_dma_unmap_region(pdata, unmap.iova);
+}
+
 static long afu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct platform_device *pdev = filp->private_data;
@@ -384,6 +437,10 @@ static long afu_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return afu_ioctl_get_info(pdata, (void __user *)arg);
 	case DFL_FPGA_PORT_GET_REGION_INFO:
 		return afu_ioctl_get_region_info(pdata, (void __user *)arg);
+	case DFL_FPGA_PORT_DMA_MAP:
+		return afu_ioctl_dma_map(pdata, (void __user *)arg);
+	case DFL_FPGA_PORT_DMA_UNMAP:
+		return afu_ioctl_dma_unmap(pdata, (void __user *)arg);
 	default:
 		/*
 		 * Let sub-feature's ioctl function to handle the cmd
@@ -460,6 +517,7 @@ static int afu_dev_init(struct platform_device *pdev)
 	mutex_lock(&pdata->lock);
 	dfl_fpga_pdata_set_private(pdata, afu);
 	afu_mmio_region_init(pdata);
+	afu_dma_region_init(pdata);
 	mutex_unlock(&pdata->lock);
 
 	return 0;
@@ -473,6 +531,7 @@ static int afu_dev_destroy(struct platform_device *pdev)
 	mutex_lock(&pdata->lock);
 	afu = dfl_fpga_pdata_get_private(pdata);
 	afu_mmio_region_destroy(pdata);
+	afu_dma_region_destroy(pdata);
 	dfl_fpga_pdata_set_private(pdata, NULL);
 	mutex_unlock(&pdata->lock);
 
diff --git a/drivers/fpga/dfl-afu.h b/drivers/fpga/dfl-afu.h
index 11ce2cf99759..0c7630ae3cda 100644
--- a/drivers/fpga/dfl-afu.h
+++ b/drivers/fpga/dfl-afu.h
@@ -40,12 +40,32 @@ struct dfl_afu_mmio_region {
 	struct list_head node;
 };
 
+/**
+ * struct dfl_afu_dma_region - afu DMA region data structure
+ *
+ * @user_addr: region userspace virtual address.
+ * @length: region length.
+ * @iova: region IO virtual address.
+ * @pages: ptr to pages of this region.
+ * @node: rb tree node.
+ * @in_use: flag to indicate if this region is in_use.
+ */
+struct dfl_afu_dma_region {
+	u64 user_addr;
+	u64 length;
+	u64 iova;
+	struct page **pages;
+	struct rb_node node;
+	bool in_use;
+};
+
 /**
  * struct dfl_afu - afu device data structure
  *
  * @region_cur_offset: current region offset from start to the device fd.
  * @num_regions: num of mmio regions.
  * @regions: the mmio region linked list of this afu feature device.
+ * @dma_regions: root of dma regions rb tree.
  * @num_umsgs: num of umsgs.
  * @pdata: afu platform device's pdata.
  */
@@ -54,6 +74,7 @@ struct dfl_afu {
 	int num_regions;
 	u8 num_umsgs;
 	struct list_head regions;
+	struct rb_root dma_regions;
 
 	struct dfl_feature_platform_data *pdata;
 };
@@ -68,4 +89,12 @@ int afu_mmio_region_get_by_index(struct dfl_feature_platform_data *pdata,
 int afu_mmio_region_get_by_offset(struct dfl_feature_platform_data *pdata,
 				  u64 offset, u64 size,
 				  struct dfl_afu_mmio_region *pregion);
-#endif
+void afu_dma_region_init(struct dfl_feature_platform_data *pdata);
+void afu_dma_region_destroy(struct dfl_feature_platform_data *pdata);
+int afu_dma_map_region(struct dfl_feature_platform_data *pdata,
+		       u64 user_addr, u64 length, u64 *iova);
+int afu_dma_unmap_region(struct dfl_feature_platform_data *pdata, u64 iova);
+struct dfl_afu_dma_region *
+afu_dma_region_find(struct dfl_feature_platform_data *pdata,
+		    u64 iova, u64 size);
+#endif /* __DFL_AFU_H */
diff --git a/include/uapi/linux/fpga-dfl.h b/include/uapi/linux/fpga-dfl.h
index a3ccdfb115a5..2e324e515c41 100644
--- a/include/uapi/linux/fpga-dfl.h
+++ b/include/uapi/linux/fpga-dfl.h
@@ -114,6 +114,43 @@ struct dfl_fpga_port_region_info {
 
 #define DFL_FPGA_PORT_GET_REGION_INFO	_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 2)
 
+/**
+ * DFL_FPGA_PORT_DMA_MAP - _IOWR(DFL_FPGA_MAGIC, DFL_PORT_BASE + 3,
+ *						struct dfl_fpga_port_dma_map)
+ *
+ * Map the dma memory per user_addr and length which are provided by caller.
+ * Driver fills the iova in provided struct dfl_fpga_port_dma_map.
+ * This interface only accepts page-size aligned user memory for dma mapping.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_port_dma_map {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	__u32 flags;		/* Zero for now */
+	__u64 user_addr;        /* Process virtual address */
+	__u64 length;           /* Length of mapping (bytes)*/
+	/* Output */
+	__u64 iova;             /* IO virtual address */
+};
+
+#define DFL_FPGA_PORT_DMA_MAP		_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 3)
+
+/**
+ * DFL_FPGA_PORT_DMA_UNMAP - _IOW(DFL_FPGA_MAGIC, DFL_PORT_BASE + 4,
+ *						struct dfl_fpga_port_dma_unmap)
+ *
+ * Unmap the dma memory per iova provided by caller.
+ * Return: 0 on success, -errno on failure.
+ */
+struct dfl_fpga_port_dma_unmap {
+	/* Input */
+	__u32 argsz;		/* Structure length */
+	__u32 flags;		/* Zero for now */
+	__u64 iova;		/* IO virtual address */
+};
+
+#define DFL_FPGA_PORT_DMA_UNMAP		_IO(DFL_FPGA_MAGIC, DFL_PORT_BASE + 4)
+
 /* IOCTLs for FME file descriptor */
 
 /**
-- 
cgit v1.2.3


From 45cd74cb5061781e793a098c420a7f548fdc9e7d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 10 Jul 2018 17:15:38 +0200
Subject: eventpoll.h: wrap casts in () properly

When importing the latest copy of the kernel headers into Bionic,
Christopher and Elliott noticed that the eventpoll.h casts were not
wrapped in ().  As it is, clang complains about macros without
surrounding (), so this makes it a pain for userspace tools.

So fix it up by adding another () pair, and make them line up purty by
using tabs.

Fixes: 65aaf87b3aa2 ("add EPOLLNVAL, annotate EPOLL... and event_poll->event")
Reported-by: Christopher Ferris <cferris@google.com>
Reported-by: Elliott Hughes <enh@google.com>
Cc: stable <stable@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/eventpoll.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index bf48e71f2634..8a3432d0f0dc 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -42,7 +42,7 @@
 #define EPOLLRDHUP	(__force __poll_t)0x00002000
 
 /* Set exclusive wakeup mode for the target file descriptor */
-#define EPOLLEXCLUSIVE (__force __poll_t)(1U << 28)
+#define EPOLLEXCLUSIVE	((__force __poll_t)(1U << 28))
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
@@ -54,13 +54,13 @@
  *
  * Requires CAP_BLOCK_SUSPEND
  */
-#define EPOLLWAKEUP (__force __poll_t)(1U << 29)
+#define EPOLLWAKEUP	((__force __poll_t)(1U << 29))
 
 /* Set the One Shot behaviour for the target file descriptor */
-#define EPOLLONESHOT (__force __poll_t)(1U << 30)
+#define EPOLLONESHOT	((__force __poll_t)(1U << 30))
 
 /* Set the Edge Triggered behaviour for the target file descriptor */
-#define EPOLLET (__force __poll_t)(1U << 31)
+#define EPOLLET		((__force __poll_t)(1U << 31))
 
 /* 
  * On x86-64 make the 64bit structure have the same alignment as the
-- 
cgit v1.2.3


From 0f292f023ffcc67ec49d63dcb7fe388711cbb83a Mon Sep 17 00:00:00 2001
From: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Date: Wed, 11 Jul 2018 13:37:53 +0100
Subject: ALSA: usb-audio: Add support for Processing Units in UAC3

This patch adds support for the Processing Units defined in
the UAC3 spec. The main difference with the previous specs
is the lack of on/off switches in the controls for these
units and the addition of the new Multi Function Processing
Unit.

The current version of the UAC3 spec doesn't define any
useful controls for the new Multi Function Processing Unit
so no control will get created once this unit is parsed.

Signed-off-by: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/usb/audio-v3.h   | 15 +++++++++++++
 include/uapi/linux/usb/audio.h | 49 ++++++++++++++++++++++++++++++++--------
 sound/usb/mixer.c              | 51 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 104 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/usb/audio-v3.h b/include/linux/usb/audio-v3.h
index a710e28b5215..334bfa6dfb47 100644
--- a/include/linux/usb/audio-v3.h
+++ b/include/linux/usb/audio-v3.h
@@ -387,6 +387,12 @@ struct uac3_interrupt_data_msg {
 #define UAC3_CONNECTORS			0x0f
 #define UAC3_POWER_DOMAIN		0x10
 
+/* A.20 PROCESSING UNIT PROCESS TYPES */
+#define UAC3_PROCESS_UNDEFINED		0x00
+#define UAC3_PROCESS_UP_DOWNMIX		0x01
+#define UAC3_PROCESS_STEREO_EXTENDER	0x02
+#define UAC3_PROCESS_MULTI_FUNCTION	0x03
+
 /* A.22 AUDIO CLASS-SPECIFIC REQUEST CODES */
 /* see audio-v2.h for the rest, which is identical to v2 */
 #define UAC3_CS_REQ_INTEN			0x04
@@ -406,6 +412,15 @@ struct uac3_interrupt_data_msg {
 #define UAC3_TE_OVERFLOW			0x04
 #define UAC3_TE_LATENCY 			0x05
 
+/* A.23.10 PROCESSING UNITS CONTROL SELECTORS */
+
+/* Up/Down Mixer */
+#define UAC3_UD_MODE_SELECT			0x01
+
+/* Stereo Extender */
+#define UAC3_EXT_WIDTH_CONTROL			0x01
+
+
 /* BADD predefined Unit/Terminal values */
 #define UAC3_BADD_IT_ID1	1  /* Input Terminal ID1: bTerminalID = 1 */
 #define UAC3_BADD_FU_ID2	2  /* Feature Unit ID2: bUnitID = 2 */
diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h
index 74e520fb944f..ddc5396800aa 100644
--- a/include/uapi/linux/usb/audio.h
+++ b/include/uapi/linux/usb/audio.h
@@ -390,33 +390,64 @@ static inline __u8 uac_processing_unit_iChannelNames(struct uac_processing_unit_
 static inline __u8 uac_processing_unit_bControlSize(struct uac_processing_unit_descriptor *desc,
 						    int protocol)
 {
-	return (protocol == UAC_VERSION_1) ?
-		desc->baSourceID[desc->bNrInPins + 4] :
-		2; /* in UAC2, this value is constant */
+	switch (protocol) {
+	case UAC_VERSION_1:
+		return desc->baSourceID[desc->bNrInPins + 4];
+	case UAC_VERSION_2:
+		return 2; /* in UAC2, this value is constant */
+	case UAC_VERSION_3:
+		return 4; /* in UAC3, this value is constant */
+	default:
+		return 1;
+	}
 }
 
 static inline __u8 *uac_processing_unit_bmControls(struct uac_processing_unit_descriptor *desc,
 						   int protocol)
 {
-	return (protocol == UAC_VERSION_1) ?
-		&desc->baSourceID[desc->bNrInPins + 5] :
-		&desc->baSourceID[desc->bNrInPins + 6];
+	switch (protocol) {
+	case UAC_VERSION_1:
+		return &desc->baSourceID[desc->bNrInPins + 5];
+	case UAC_VERSION_2:
+		return &desc->baSourceID[desc->bNrInPins + 6];
+	case UAC_VERSION_3:
+		return &desc->baSourceID[desc->bNrInPins + 2];
+	default:
+		return NULL;
+	}
 }
 
 static inline __u8 uac_processing_unit_iProcessing(struct uac_processing_unit_descriptor *desc,
 						   int protocol)
 {
 	__u8 control_size = uac_processing_unit_bControlSize(desc, protocol);
-	return *(uac_processing_unit_bmControls(desc, protocol)
-			+ control_size);
+
+	switch (protocol) {
+	case UAC_VERSION_1:
+	case UAC_VERSION_2:
+	default:
+		return *(uac_processing_unit_bmControls(desc, protocol)
+			 + control_size);
+	case UAC_VERSION_3:
+		return 0; /* UAC3 does not have this field */
+	}
 }
 
 static inline __u8 *uac_processing_unit_specific(struct uac_processing_unit_descriptor *desc,
 						 int protocol)
 {
 	__u8 control_size = uac_processing_unit_bControlSize(desc, protocol);
-	return uac_processing_unit_bmControls(desc, protocol)
+
+	switch (protocol) {
+	case UAC_VERSION_1:
+	case UAC_VERSION_2:
+	default:
+		return uac_processing_unit_bmControls(desc, protocol)
 			+ control_size + 1;
+	case UAC_VERSION_3:
+		return uac_processing_unit_bmControls(desc, protocol)
+			+ control_size;
+	}
 }
 
 /* 4.5.2 Class-Specific AS Interface Descriptor */
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index bfb3484096a6..39fde49e8749 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -953,6 +953,23 @@ static int check_input_term(struct mixer_build *state, int id,
 
 				return 0;
 			}
+			case UAC3_PROCESSING_UNIT: {
+				struct uac_processing_unit_descriptor *d = p1;
+
+				if (!d->bNrInPins)
+					return -EINVAL;
+
+				/* call recursively to retrieve the channel info */
+				err = check_input_term(state, d->baSourceID[0], term);
+				if (err < 0)
+					return err;
+
+				term->type = d->bDescriptorSubtype << 16; /* virtual type */
+				term->id = id;
+				term->name = 0; /* TODO: UAC3 Class-specific strings */
+
+				return 0;
+			}
 			default:
 				return -ENODEV;
 			}
@@ -2180,6 +2197,11 @@ struct procunit_info {
 	struct procunit_value_info *values;
 };
 
+static struct procunit_value_info undefined_proc_info[] = {
+	{ 0x00, "Control Undefined", 0 },
+	{ 0 }
+};
+
 static struct procunit_value_info updown_proc_info[] = {
 	{ UAC_UD_ENABLE, "Switch", USB_MIXER_BOOLEAN },
 	{ UAC_UD_MODE_SELECT, "Mode Select", USB_MIXER_U8, 1 },
@@ -2228,6 +2250,23 @@ static struct procunit_info procunits[] = {
 	{ UAC_PROCESS_DYN_RANGE_COMP, "DCR", dcr_proc_info },
 	{ 0 },
 };
+
+static struct procunit_value_info uac3_updown_proc_info[] = {
+	{ UAC3_UD_MODE_SELECT, "Mode Select", USB_MIXER_U8, 1 },
+	{ 0 }
+};
+static struct procunit_value_info uac3_stereo_ext_proc_info[] = {
+	{ UAC3_EXT_WIDTH_CONTROL, "Width Control", USB_MIXER_U8 },
+	{ 0 }
+};
+
+static struct procunit_info uac3_procunits[] = {
+	{ UAC3_PROCESS_UP_DOWNMIX, "Up Down", uac3_updown_proc_info },
+	{ UAC3_PROCESS_STEREO_EXTENDER, "3D Stereo Extender", uac3_stereo_ext_proc_info },
+	{ UAC3_PROCESS_MULTI_FUNCTION, "Multi-Function", undefined_proc_info },
+	{ 0 },
+};
+
 /*
  * predefined data for extension units
  */
@@ -2388,8 +2427,16 @@ static int build_audio_procunit(struct mixer_build *state, int unitid,
 static int parse_audio_processing_unit(struct mixer_build *state, int unitid,
 				       void *raw_desc)
 {
-	return build_audio_procunit(state, unitid, raw_desc,
-				    procunits, "Processing Unit");
+	switch (state->mixer->protocol) {
+	case UAC_VERSION_1:
+	case UAC_VERSION_2:
+	default:
+		return build_audio_procunit(state, unitid, raw_desc,
+				procunits, "Processing Unit");
+	case UAC_VERSION_3:
+		return build_audio_procunit(state, unitid, raw_desc,
+				uac3_procunits, "Processing Unit");
+	}
 }
 
 static int parse_audio_extension_unit(struct mixer_build *state, int unitid,
-- 
cgit v1.2.3


From 31048d7aedf31bf0f69c54a662944632f29d82f2 Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Sun, 15 Jul 2018 11:36:37 -0400
Subject: tcp: Fix broken repair socket window probe patch

Correct previous bad attempt at allowing sockets to come out of TCP
repair without sending window probes. To avoid changing size of
the repair variable in struct tcp_sock, this lets the decision for
sending probes or not to be made when coming out of repair by
introducing two ways to turn it off.

v2:
* Remove erroneous comment; defines now make behavior clear

Fixes: 70b7ff130224 ("tcp: allow user to create repair socket without window probes")
Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |  4 ++++
 net/ipv4/tcp.c           | 13 +++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..e3f6ed8a7064 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -127,6 +127,10 @@ enum {
 
 #define TCP_CM_INQ		TCP_INQ
 
+#define TCP_REPAIR_ON		1
+#define TCP_REPAIR_OFF		0
+#define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
+
 struct tcp_repair_opt {
 	__u32	opt_code;
 	__u32	opt_val;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e5e2ca9ab1b..ec2186e3087f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2823,16 +2823,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_REPAIR:
 		if (!tcp_can_repair_sock(sk))
 			err = -EPERM;
-		/* 1 for normal repair, 2 for no window probes */
-		else if (val == 1 || val == 2) {
-			tp->repair = val;
+		else if (val == TCP_REPAIR_ON) {
+			tp->repair = 1;
 			sk->sk_reuse = SK_FORCE_REUSE;
 			tp->repair_queue = TCP_NO_QUEUE;
-		} else if (val == 0) {
+		} else if (val == TCP_REPAIR_OFF) {
+			tp->repair = 0;
+			sk->sk_reuse = SK_NO_REUSE;
+			tcp_send_window_probe(sk);
+		} else if (val == TCP_REPAIR_OFF_NO_WP) {
 			tp->repair = 0;
 			sk->sk_reuse = SK_NO_REUSE;
-			if (tp->repair == 1)
-				tcp_send_window_probe(sk);
 		} else
 			err = -EINVAL;
 
-- 
cgit v1.2.3


From b875a5a529bedf73532000f4e2496c00a00f4765 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jul 2018 16:13:43 +0200
Subject: input: add MT_TOOL_DIAL

A dial is a tool you place on a multitouch surface which reports its
orientation or a relative angle of rotation when rotating its knob.

Some examples are the Dell Totem (on the Canvas 27"), the Microsoft Dial,
or the Griffin Powermate, though the latter can't be put on a touch surface.

We give some extra space to account for other types of fingers if we need
(MT_TOOL_THUMB)

Slightly change the documentation to not make it mandatory to update each
MT_TOOL we add.

Reviewed-by: Peter Hutterer <peter.hutterer@who-t.net>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/input/multi-touch-protocol.rst | 12 ++++++------
 include/uapi/linux/input.h                   |  9 +++++----
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/input/multi-touch-protocol.rst b/Documentation/input/multi-touch-protocol.rst
index b51751a0cd5d..6be70342e709 100644
--- a/Documentation/input/multi-touch-protocol.rst
+++ b/Documentation/input/multi-touch-protocol.rst
@@ -310,12 +310,12 @@ ABS_MT_TOOL_Y
 ABS_MT_TOOL_TYPE
     The type of approaching tool. A lot of kernel drivers cannot distinguish
     between different tool types, such as a finger or a pen. In such cases, the
-    event should be omitted. The protocol currently supports MT_TOOL_FINGER,
-    MT_TOOL_PEN, and MT_TOOL_PALM [#f2]_. For type B devices, this event is
-    handled by input core; drivers should instead use
-    input_mt_report_slot_state(). A contact's ABS_MT_TOOL_TYPE may change over
-    time while still touching the device, because the firmware may not be able
-    to determine which tool is being used when it first appears.
+    event should be omitted. The protocol currently mainly supports
+    MT_TOOL_FINGER, MT_TOOL_PEN, and MT_TOOL_PALM [#f2]_.
+    For type B devices, this event is handled by input core; drivers should
+    instead use input_mt_report_slot_state(). A contact's ABS_MT_TOOL_TYPE may
+    change over time while still touching the device, because the firmware may
+    not be able to determine which tool is being used when it first appears.
 
 ABS_MT_BLOB_ID
     The BLOB_ID groups several packets together into one arbitrarily shaped
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 7288a7c573cc..fb78f6f500f3 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -270,10 +270,11 @@ struct input_mask {
 /*
  * MT_TOOL types
  */
-#define MT_TOOL_FINGER		0
-#define MT_TOOL_PEN		1
-#define MT_TOOL_PALM		2
-#define MT_TOOL_MAX		2
+#define MT_TOOL_FINGER		0x00
+#define MT_TOOL_PEN		0x01
+#define MT_TOOL_PALM		0x02
+#define MT_TOOL_DIAL		0x0a
+#define MT_TOOL_MAX		0x0f
 
 /*
  * Values describing the status of a force-feedback effect
-- 
cgit v1.2.3


From 9ba546c01976a426292af99e682a557075d6c010 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jul 2018 15:48:46 +0200
Subject: aio: don't expose __aio_sigset in uapi

glibc uses a different definition of sigset_t than the kernel does,
and the current version would pull in both.  To fix this just do not
expose the type at all - this somewhat mirrors pselect() where we
do not even have a type for the magic sigmask argument, but just
use pointer arithmetic.

Fixes: 7a074e96 ("aio: implement io_pgetevents")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Adrian Reber <adrian@lisas.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c                     | 5 +++++
 include/linux/syscalls.h     | 1 +
 include/uapi/linux/aio_abi.h | 6 ------
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/aio.c b/fs/aio.c
index e1d20124ec0e..b1a42e45698b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2042,6 +2042,11 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	return ret;
 }
 
+struct __aio_sigset {
+	const sigset_t __user	*sigmask;
+	size_t		sigsetsize;
+};
+
 SYSCALL_DEFINE6(io_pgetevents,
 		aio_context_t, ctx_id,
 		long, min_nr,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 73810808cdf2..b06b5eeda8e8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -11,6 +11,7 @@
 #ifndef _LINUX_SYSCALLS_H
 #define _LINUX_SYSCALLS_H
 
+struct __aio_sigset;
 struct epoll_event;
 struct iattr;
 struct inode;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d00221345c19..ce43d340f010 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -29,7 +29,6 @@
 
 #include <linux/types.h>
 #include <linux/fs.h>
-#include <linux/signal.h>
 #include <asm/byteorder.h>
 
 typedef __kernel_ulong_t aio_context_t;
@@ -108,10 +107,5 @@ struct iocb {
 #undef IFBIG
 #undef IFLITTLE
 
-struct __aio_sigset {
-	const sigset_t __user	*sigmask;
-	size_t		sigsetsize;
-};
-
 #endif /* __LINUX__AIO_ABI_H */
 
-- 
cgit v1.2.3


From 7d25f8851a2c03319bfa8e56bb40bde2c4621392 Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Thu, 12 Jul 2018 17:48:06 +0200
Subject: netfilter: nft_socket: Expose socket mark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 +++-
 net/netfilter/nft_socket.c               | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 89438e68dc03..f466860bcf75 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -921,10 +921,12 @@ enum nft_socket_attributes {
 /*
  * enum nft_socket_keys - nf_tables socket expression keys
  *
- * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_
+ * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option
+ * @NFT_SOCKET_MARK: Value of the socket mark
  */
 enum nft_socket_keys {
 	NFT_SOCKET_TRANSPARENT,
+	NFT_SOCKET_MARK,
 	__NFT_SOCKET_MAX
 };
 #define NFT_SOCKET_MAX	(__NFT_SOCKET_MAX - 1)
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 622ac2012a40..d7f3776dfd71 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -54,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr,
 	case NFT_SOCKET_TRANSPARENT:
 		nft_reg_store8(dest, inet_sk_transparent(sk));
 		break;
+	case NFT_SOCKET_MARK:
+		if (sk_fullsock(sk)) {
+			*dest = sk->sk_mark;
+		} else {
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+		break;
 	default:
 		WARN_ON(1);
 		regs->verdict.code = NFT_BREAK;
@@ -91,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx,
 	case NFT_SOCKET_TRANSPARENT:
 		len = sizeof(u8);
 		break;
+	case NFT_SOCKET_MARK:
+		len = sizeof(u32);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 24c458c485c87eef97e91d2e180f222555528b11 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Sat, 14 Jul 2018 16:50:59 +0200
Subject: netfilter: nf_osf: add missing definitions to header file

Add missing definitions from nf_osf.h in order to extract Passive OS
fingerprint infrastructure from xt_osf.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_osf.h | 11 +++++++++++
 include/uapi/linux/netfilter/xt_osf.h | 10 ++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index 8f2f2f403183..3738116b2bbe 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -16,9 +16,14 @@
 
 #define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
 
+/* Check if ip TTL is less than fingerprint one */
+#define NF_OSF_TTL_LESS			1
+
 /* Do not compare ip and fingerprint TTL at all */
 #define NF_OSF_TTL_NOCHECK		2
 
+#define NF_OSF_FLAGMASK		(NF_OSF_GENRE | NF_OSF_TTL | \
+				 NF_OSF_LOG | NF_OSF_INVERT)
 /* Wildcard MSS (kind of).
  * It is used to implement a state machine for the different wildcard values
  * of the MSS and window sizes.
@@ -83,4 +88,10 @@ enum iana_options {
 	OSFOPT_EMPTY = 255,
 };
 
+enum nf_osf_attr_type {
+	OSF_ATTR_UNSPEC,
+	OSF_ATTR_FINGER,
+	OSF_ATTR_MAX,
+};
+
 #endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index 72956eceeb09..b189007f4f28 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -37,8 +37,7 @@
 
 #define XT_OSF_TTL_TRUE		NF_OSF_TTL_TRUE
 #define XT_OSF_TTL_NOCHECK	NF_OSF_TTL_NOCHECK
-
-#define XT_OSF_TTL_LESS	1	/* Check if ip TTL is less than fingerprint one */
+#define XT_OSF_TTL_LESS		NF_OSF_TTL_LESS
 
 #define xt_osf_wc		nf_osf_wc
 #define xt_osf_opt		nf_osf_opt
@@ -47,6 +46,7 @@
 #define xt_osf_finger		nf_osf_finger
 #define xt_osf_nlmsg		nf_osf_nlmsg
 
+#define xt_osf_attr_type	nf_osf_attr_type
 /*
  * Add/remove fingerprint from the kernel.
  */
@@ -56,10 +56,4 @@ enum xt_osf_msg_types {
 	OSF_MSG_MAX,
 };
 
-enum xt_osf_attr_type {
-	OSF_ATTR_UNSPEC,
-	OSF_ATTR_FINGER,
-	OSF_ATTR_MAX,
-};
-
 #endif				/* _XT_OSF_H */
-- 
cgit v1.2.3


From dba31ee759417ef1a952e929524b0cca1751c036 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.vnet.ibm.com>
Date: Mon, 4 Jun 2018 16:54:55 -0400
Subject: ima: Differentiate auditing policy rules from "audit" actions

The AUDIT_INTEGRITY_RULE is used for auditing IMA policy rules and
the IMA "audit" policy action.  This patch defines
AUDIT_INTEGRITY_POLICY_RULE to reflect the IMA policy rules.

Since we defined a new message type we can now also pass the
audit_context and get an associated SYSCALL record. This now produces
the following records when parsing IMA policy's rules:

type=UNKNOWN[1807] msg=audit(1527888965.738:320): action=audit \
  func=MMAP_CHECK mask=MAY_EXEC res=1
type=UNKNOWN[1807] msg=audit(1527888965.738:320): action=audit \
  func=FILE_CHECK mask=MAY_READ res=1
type=SYSCALL msg=audit(1527888965.738:320): arch=c000003e syscall=1 \
  success=yes exit=17 a0=1 a1=55bcfcca9030 a2=11 a3=7fcc1b55fb38 \
  items=0 ppid=1567 pid=1601 auid=0 uid=0 gid=0 euid=0 suid=0 \
  fsuid=0 egid=0 sgid=0 fsgid=0 tty=tty2 ses=2 comm="echo" \
  exe="/usr/bin/echo" \
  subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)

Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 include/uapi/linux/audit.h          | 1 +
 security/integrity/ima/ima_policy.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index c35aee9ad4a6..cf2bad8d7873 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -148,6 +148,7 @@
 #define AUDIT_INTEGRITY_PCR	    1804 /* PCR invalidation msgs */
 #define AUDIT_INTEGRITY_RULE	    1805 /* policy rule */
 #define AUDIT_INTEGRITY_EVM_XATTR   1806 /* New EVM-covered xattr */
+#define AUDIT_INTEGRITY_POLICY_RULE 1807 /* IMA policy rules */
 
 #define AUDIT_KERNEL		2000	/* Asynchronous audit record. NOT A REQUEST. */
 
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 0178bdaa40aa..8c9499867c91 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -681,8 +681,8 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
 	bool uid_token;
 	int result = 0;
 
-	ab = integrity_audit_log_start(NULL, GFP_KERNEL,
-				       AUDIT_INTEGRITY_RULE);
+	ab = integrity_audit_log_start(audit_context(), GFP_KERNEL,
+				       AUDIT_INTEGRITY_POLICY_RULE);
 
 	entry->uid = INVALID_UID;
 	entry->fowner = INVALID_UID;
-- 
cgit v1.2.3


From c2e00e31087e58f6c49b90b4702fc3df4fad6a83 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Date: Thu, 19 Jul 2018 10:32:19 +0200
Subject: pci-epf-test/pci_endpoint_test: Add MSI-X support

Add MSI-X support and update driver documentation accordingly.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 Documentation/PCI/endpoint/pci-endpoint.txt       |  4 ++--
 Documentation/PCI/endpoint/pci-test-function.txt  |  4 +++-
 Documentation/PCI/endpoint/pci-test-howto.txt     | 22 ++++++++++++++---
 Documentation/ioctl/ioctl-number.txt              |  1 +
 Documentation/misc-devices/pci-endpoint-test.txt  |  3 +++
 drivers/misc/pci_endpoint_test.c                  | 29 ++++++++++++++++-------
 drivers/pci/controller/dwc/pcie-designware-plat.c |  1 +
 drivers/pci/endpoint/functions/pci-epf-test.c     | 29 +++++++++++++++++++++--
 include/linux/pci-epc.h                           |  1 +
 include/uapi/linux/pcitest.h                      |  1 +
 10 files changed, 79 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/PCI/endpoint/pci-endpoint.txt b/Documentation/PCI/endpoint/pci-endpoint.txt
index 9b1d66829290..e86a96b66a6a 100644
--- a/Documentation/PCI/endpoint/pci-endpoint.txt
+++ b/Documentation/PCI/endpoint/pci-endpoint.txt
@@ -44,7 +44,7 @@ by the PCI controller driver.
 	 * clear_bar: ops to reset the BAR
 	 * alloc_addr_space: ops to allocate in PCI controller address space
 	 * free_addr_space: ops to free the allocated address space
-	 * raise_irq: ops to raise a legacy or MSI interrupt
+	 * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
 	 * start: ops to start the PCI link
 	 * stop: ops to stop the PCI link
 
@@ -96,7 +96,7 @@ by the PCI endpoint function driver.
 *) pci_epc_raise_irq()
 
    The PCI endpoint function driver should use pci_epc_raise_irq() to raise
-   Legacy Interrupt or MSI Interrupt.
+   Legacy Interrupt, MSI or MSI-X Interrupt.
 
 *) pci_epc_mem_alloc_addr()
 
diff --git a/Documentation/PCI/endpoint/pci-test-function.txt b/Documentation/PCI/endpoint/pci-test-function.txt
index bf4b5cf6fee6..5916f1f592bb 100644
--- a/Documentation/PCI/endpoint/pci-test-function.txt
+++ b/Documentation/PCI/endpoint/pci-test-function.txt
@@ -36,7 +36,7 @@ that the endpoint device must perform.
 Bitfield Description:
   Bit 0		: raise legacy IRQ
   Bit 1		: raise MSI IRQ
-  Bit 2		: raise MSI-X IRQ (reserved for future implementation)
+  Bit 2		: raise MSI-X IRQ
   Bit 3		: read command (read data from RC buffer)
   Bit 4		: write command (write data to RC buffer)
   Bit 5		: copy command (copy data from one RC buffer to another
@@ -75,6 +75,7 @@ for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands.
 Possible types:
  - Legacy	: 0
  - MSI		: 1
+ - MSI-X	: 2
 
 *) PCI_ENDPOINT_TEST_IRQ_NUMBER
 
@@ -83,3 +84,4 @@ This register contains the triggered ID interrupt.
 Admissible values:
  - Legacy	: 0
  - MSI		: [1 .. 32]
+ - MSI-X	: [1 .. 2048]
diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt
index 75f48c3bb191..65f1a137e35c 100644
--- a/Documentation/PCI/endpoint/pci-test-howto.txt
+++ b/Documentation/PCI/endpoint/pci-test-howto.txt
@@ -45,9 +45,9 @@ The PCI endpoint framework populates the directory with the following
 configurable fields.
 
 	# ls functions/pci_epf_test/func1
-	  baseclass_code	interrupt_pin	revid		subsys_vendor_id
-	  cache_line_size	msi_interrupts	subclass_code	vendorid
-	  deviceid          	progif_code	subsys_id
+	  baseclass_code	interrupt_pin	progif_code	subsys_id
+	  cache_line_size	msi_interrupts	revid		subsys_vendorid
+	  deviceid          	msix_interrupts	subclass_code	vendorid
 
 The PCI endpoint function driver populates these entries with default values
 when the device is bound to the driver. The pci-epf-test driver populates
@@ -67,6 +67,7 @@ device, the following commands can be used.
 	# echo 0x104c > functions/pci_epf_test/func1/vendorid
 	# echo 0xb500 > functions/pci_epf_test/func1/deviceid
 	# echo 16 > functions/pci_epf_test/func1/msi_interrupts
+	# echo 8 > functions/pci_epf_test/func1/msix_interrupts
 
 1.5 Binding pci-epf-test Device to EP Controller
 
@@ -153,6 +154,21 @@ following commands.
 	MSI30:          NOT OKAY
 	MSI31:          NOT OKAY
 	MSI32:          NOT OKAY
+	MSIX1:          OKAY
+	MSIX2:          OKAY
+	MSIX3:          OKAY
+	MSIX4:          OKAY
+	MSIX5:          OKAY
+	MSIX6:          OKAY
+	MSIX7:          OKAY
+	MSIX8:          OKAY
+	MSIX9:          NOT OKAY
+	MSIX10:         NOT OKAY
+	MSIX11:         NOT OKAY
+	MSIX12:         NOT OKAY
+	MSIX13:         NOT OKAY
+	[...]
+	MSIX2048:       NOT OKAY
 
 	Read Tests
 
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 480c8609dc58..65259d459fd1 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -166,6 +166,7 @@ Code  Seq#(hex)	Include File		Comments
 'P'	all	linux/soundcard.h	conflict!
 'P'	60-6F	sound/sscape_ioctl.h	conflict!
 'P'	00-0F	drivers/usb/class/usblp.c	conflict!
+'P'	01-07	drivers/misc/pci_endpoint_test.c	conflict!
 'Q'	all	linux/soundcard.h
 'R'	00-1F	linux/random.h		conflict!
 'R'	01	linux/rfkill.h		conflict!
diff --git a/Documentation/misc-devices/pci-endpoint-test.txt b/Documentation/misc-devices/pci-endpoint-test.txt
index 4ebc3594b32c..fdfa0f66d3d0 100644
--- a/Documentation/misc-devices/pci-endpoint-test.txt
+++ b/Documentation/misc-devices/pci-endpoint-test.txt
@@ -10,6 +10,7 @@ The PCI driver for the test device performs the following tests
 	*) verifying addresses programmed in BAR
 	*) raise legacy IRQ
 	*) raise MSI IRQ
+	*) raise MSI-X IRQ
 	*) read data
 	*) write data
 	*) copy data
@@ -25,6 +26,8 @@ ioctl
  PCITEST_LEGACY_IRQ: Tests legacy IRQ
  PCITEST_MSI: Tests message signalled interrupts. The MSI number
 	      to be tested should be passed as argument.
+ PCITEST_MSIX: Tests message signalled interrupts. The MSI-X number
+	      to be tested should be passed as argument.
  PCITEST_WRITE: Perform write tests. The size of the buffer should be passed
 		as argument.
  PCITEST_READ: Perform read tests. The size of the buffer should be passed
diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 349794cbe1f3..f4fef108caff 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -39,13 +39,14 @@
 
 #define IRQ_TYPE_LEGACY				0
 #define IRQ_TYPE_MSI				1
+#define IRQ_TYPE_MSIX				2
 
 #define PCI_ENDPOINT_TEST_MAGIC			0x0
 
 #define PCI_ENDPOINT_TEST_COMMAND		0x4
 #define COMMAND_RAISE_LEGACY_IRQ		BIT(0)
 #define COMMAND_RAISE_MSI_IRQ			BIT(1)
-/* BIT(2) is reserved for raising MSI-X IRQ command */
+#define COMMAND_RAISE_MSIX_IRQ			BIT(2)
 #define COMMAND_READ				BIT(3)
 #define COMMAND_WRITE				BIT(4)
 #define COMMAND_COPY				BIT(5)
@@ -84,7 +85,7 @@ MODULE_PARM_DESC(no_msi, "Disable MSI interrupt in pci_endpoint_test");
 
 static int irq_type = IRQ_TYPE_MSI;
 module_param(irq_type, int, 0444);
-MODULE_PARM_DESC(irq_type, "IRQ mode selection in pci_endpoint_test (0 - Legacy, 1 - MSI)");
+MODULE_PARM_DESC(irq_type, "IRQ mode selection in pci_endpoint_test (0 - Legacy, 1 - MSI, 2 - MSI-X)");
 
 enum pci_barno {
 	BAR_0,
@@ -202,16 +203,18 @@ static bool pci_endpoint_test_legacy_irq(struct pci_endpoint_test *test)
 }
 
 static bool pci_endpoint_test_msi_irq(struct pci_endpoint_test *test,
-				      u8 msi_num)
+				       u16 msi_num, bool msix)
 {
 	u32 val;
 	struct pci_dev *pdev = test->pdev;
 
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE,
-				 IRQ_TYPE_MSI);
+				 msix == false ? IRQ_TYPE_MSI :
+				 IRQ_TYPE_MSIX);
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, msi_num);
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 COMMAND_RAISE_MSI_IRQ);
+				 msix == false ? COMMAND_RAISE_MSI_IRQ :
+				 COMMAND_RAISE_MSIX_IRQ);
 	val = wait_for_completion_timeout(&test->irq_raised,
 					  msecs_to_jiffies(1000));
 	if (!val)
@@ -456,7 +459,8 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 		ret = pci_endpoint_test_legacy_irq(test);
 		break;
 	case PCITEST_MSI:
-		ret = pci_endpoint_test_msi_irq(test, arg);
+	case PCITEST_MSIX:
+		ret = pci_endpoint_test_msi_irq(test, arg, cmd == PCITEST_MSIX);
 		break;
 	case PCITEST_WRITE:
 		ret = pci_endpoint_test_write(test, arg);
@@ -542,6 +546,12 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 			dev_err(dev, "Failed to get MSI interrupts\n");
 		test->num_irqs = irq;
 		break;
+	case IRQ_TYPE_MSIX:
+		irq = pci_alloc_irq_vectors(pdev, 1, 2048, PCI_IRQ_MSIX);
+		if (irq < 0)
+			dev_err(dev, "Failed to get MSI-X interrupts\n");
+		test->num_irqs = irq;
+		break;
 	default:
 		dev_err(dev, "Invalid IRQ type selected\n");
 	}
@@ -558,8 +568,9 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 				       pci_endpoint_test_irqhandler,
 				       IRQF_SHARED, DRV_MODULE_NAME, test);
 		if (err)
-			dev_err(dev, "failed to request IRQ %d for MSI %d\n",
-				pci_irq_vector(pdev, i), i + 1);
+			dev_err(dev, "Failed to request IRQ %d for MSI%s %d\n",
+				pci_irq_vector(pdev, i),
+				irq_type == IRQ_TYPE_MSIX ? "-X" : "", i + 1);
 	}
 
 	for (bar = BAR_0; bar <= BAR_5; bar++) {
@@ -625,6 +636,7 @@ err_iounmap:
 
 err_disable_msi:
 	pci_disable_msi(pdev);
+	pci_disable_msix(pdev);
 	pci_release_regions(pdev);
 
 err_disable_pdev:
@@ -656,6 +668,7 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev)
 	for (i = 0; i < test->num_irqs; i++)
 		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), test);
 	pci_disable_msi(pdev);
+	pci_disable_msix(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 }
diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c
index 3f8a3aa3a91e..c12bf794d69c 100644
--- a/drivers/pci/controller/dwc/pcie-designware-plat.c
+++ b/drivers/pci/controller/dwc/pcie-designware-plat.c
@@ -77,6 +77,7 @@ static void dw_plat_pcie_ep_init(struct dw_pcie_ep *ep)
 		dw_pcie_ep_reset_bar(pci, bar);
 
 	epc->features |= EPC_FEATURE_NO_LINKUP_NOTIFIER;
+	epc->features |= EPC_FEATURE_MSIX_AVAILABLE;
 }
 
 static int dw_plat_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index db4b23672004..3e86fa3c7da3 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -20,10 +20,11 @@
 
 #define IRQ_TYPE_LEGACY			0
 #define IRQ_TYPE_MSI			1
+#define IRQ_TYPE_MSIX			2
 
 #define COMMAND_RAISE_LEGACY_IRQ	BIT(0)
 #define COMMAND_RAISE_MSI_IRQ		BIT(1)
-/* BIT(2) is reserved for raising MSI-X IRQ command */
+#define COMMAND_RAISE_MSIX_IRQ		BIT(2)
 #define COMMAND_READ			BIT(3)
 #define COMMAND_WRITE			BIT(4)
 #define COMMAND_COPY			BIT(5)
@@ -47,6 +48,7 @@ struct pci_epf_test {
 	struct pci_epf		*epf;
 	enum pci_barno		test_reg_bar;
 	bool			linkup_notifier;
+	bool			msix_available;
 	struct delayed_work	cmd_handler;
 };
 
@@ -266,6 +268,9 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq_type,
 	case IRQ_TYPE_MSI:
 		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSI, irq);
 		break;
+	case IRQ_TYPE_MSIX:
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX, irq);
+		break;
 	default:
 		dev_err(dev, "Failed to raise IRQ, unknown type\n");
 		break;
@@ -292,7 +297,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 	reg->command = 0;
 	reg->status = 0;
 
-	if (reg->irq_type > IRQ_TYPE_MSI) {
+	if (reg->irq_type > IRQ_TYPE_MSIX) {
 		dev_err(dev, "Failed to detect IRQ type\n");
 		goto reset_handler;
 	}
@@ -346,6 +351,16 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 		goto reset_handler;
 	}
 
+	if (command & COMMAND_RAISE_MSIX_IRQ) {
+		count = pci_epc_get_msix(epc, epf->func_no);
+		if (reg->irq_number > count || count <= 0)
+			goto reset_handler;
+		reg->status = STATUS_IRQ_RAISED;
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX,
+				  reg->irq_number);
+		goto reset_handler;
+	}
+
 reset_handler:
 	queue_delayed_work(kpcitest_workqueue, &epf_test->cmd_handler,
 			   msecs_to_jiffies(1));
@@ -459,6 +474,8 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 	else
 		epf_test->linkup_notifier = true;
 
+	epf_test->msix_available = epc->features & EPC_FEATURE_MSIX_AVAILABLE;
+
 	epf_test->test_reg_bar = EPC_FEATURE_GET_BAR(epc->features);
 
 	ret = pci_epc_write_header(epc, epf->func_no, header);
@@ -481,6 +498,14 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 		return ret;
 	}
 
+	if (epf_test->msix_available) {
+		ret = pci_epc_set_msix(epc, epf->func_no, epf->msix_interrupts);
+		if (ret) {
+			dev_err(dev, "MSI-X configuration failed\n");
+			return ret;
+		}
+	}
+
 	if (!epf_test->linkup_notifier)
 		queue_work(kpcitest_workqueue, &epf_test->cmd_handler.work);
 
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index bb2395b56f13..37dab8116901 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -102,6 +102,7 @@ struct pci_epc {
 
 #define EPC_FEATURE_NO_LINKUP_NOTIFIER		BIT(0)
 #define EPC_FEATURE_BAR_MASK			(BIT(1) | BIT(2) | BIT(3))
+#define EPC_FEATURE_MSIX_AVAILABLE		BIT(4)
 #define EPC_FEATURE_SET_BAR(features, bar)	\
 		(features |= (EPC_FEATURE_BAR_MASK & (bar << 1)))
 #define EPC_FEATURE_GET_BAR(features)		\
diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index 953cf036cb26..d746fb159dcd 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -16,5 +16,6 @@
 #define PCITEST_WRITE		_IOW('P', 0x4, unsigned long)
 #define PCITEST_READ		_IOW('P', 0x5, unsigned long)
 #define PCITEST_COPY		_IOW('P', 0x6, unsigned long)
+#define PCITEST_MSIX		_IOW('P', 0x7, int)
 
 #endif /* __UAPI_LINUX_PCITEST_H */
-- 
cgit v1.2.3


From e03327122e2c8e6ae4565ef5b3d3cbe4364546a1 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Date: Thu, 19 Jul 2018 10:32:20 +0200
Subject: pci_endpoint_test: Add 2 ioctl commands

Add MSI-X support and update driver documentation accordingly.

Add 2 new IOCTL commands:
 - Allow reconfiguring the driver IRQ type at runtime.
 - Allow retrieving the currently configured driver IRQ type.

Add IRQ type validation before executing the READ/WRITE/COPY tests.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 Documentation/ioctl/ioctl-number.txt             |   2 +-
 Documentation/misc-devices/pci-endpoint-test.txt |   3 +
 drivers/misc/pci_endpoint_test.c                 | 206 +++++++++++++++++------
 include/uapi/linux/pcitest.h                     |   2 +
 4 files changed, 165 insertions(+), 48 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 65259d459fd1..c15c4f3bdd82 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -166,7 +166,7 @@ Code  Seq#(hex)	Include File		Comments
 'P'	all	linux/soundcard.h	conflict!
 'P'	60-6F	sound/sscape_ioctl.h	conflict!
 'P'	00-0F	drivers/usb/class/usblp.c	conflict!
-'P'	01-07	drivers/misc/pci_endpoint_test.c	conflict!
+'P'	01-09	drivers/misc/pci_endpoint_test.c	conflict!
 'Q'	all	linux/soundcard.h
 'R'	00-1F	linux/random.h		conflict!
 'R'	01	linux/rfkill.h		conflict!
diff --git a/Documentation/misc-devices/pci-endpoint-test.txt b/Documentation/misc-devices/pci-endpoint-test.txt
index fdfa0f66d3d0..58ccca4416b1 100644
--- a/Documentation/misc-devices/pci-endpoint-test.txt
+++ b/Documentation/misc-devices/pci-endpoint-test.txt
@@ -28,6 +28,9 @@ ioctl
 	      to be tested should be passed as argument.
  PCITEST_MSIX: Tests message signalled interrupts. The MSI-X number
 	      to be tested should be passed as argument.
+ PCITEST_SET_IRQTYPE: Changes driver IRQ type configuration. The IRQ type
+	      should be passed as argument (0: Legacy, 1:MSI, 2:MSI-X).
+ PCITEST_GET_IRQTYPE: Gets driver IRQ type configuration.
  PCITEST_WRITE: Perform write tests. The size of the buffer should be passed
 		as argument.
  PCITEST_READ: Perform read tests. The size of the buffer should be passed
diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index f4fef108caff..896e2df9400f 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -37,6 +37,7 @@
 
 #define DRV_MODULE_NAME				"pci-endpoint-test"
 
+#define IRQ_TYPE_UNDEFINED			-1
 #define IRQ_TYPE_LEGACY				0
 #define IRQ_TYPE_MSI				1
 #define IRQ_TYPE_MSIX				2
@@ -157,6 +158,100 @@ static irqreturn_t pci_endpoint_test_irqhandler(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static void pci_endpoint_test_free_irq_vectors(struct pci_endpoint_test *test)
+{
+	struct pci_dev *pdev = test->pdev;
+
+	pci_free_irq_vectors(pdev);
+}
+
+static bool pci_endpoint_test_alloc_irq_vectors(struct pci_endpoint_test *test,
+						int type)
+{
+	int irq = -1;
+	struct pci_dev *pdev = test->pdev;
+	struct device *dev = &pdev->dev;
+	bool res = true;
+
+	switch (type) {
+	case IRQ_TYPE_LEGACY:
+		irq = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_LEGACY);
+		if (irq < 0)
+			dev_err(dev, "Failed to get Legacy interrupt\n");
+		break;
+	case IRQ_TYPE_MSI:
+		irq = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSI);
+		if (irq < 0)
+			dev_err(dev, "Failed to get MSI interrupts\n");
+		break;
+	case IRQ_TYPE_MSIX:
+		irq = pci_alloc_irq_vectors(pdev, 1, 2048, PCI_IRQ_MSIX);
+		if (irq < 0)
+			dev_err(dev, "Failed to get MSI-X interrupts\n");
+		break;
+	default:
+		dev_err(dev, "Invalid IRQ type selected\n");
+	}
+
+	if (irq < 0) {
+		irq = 0;
+		res = false;
+	}
+	test->num_irqs = irq;
+
+	return res;
+}
+
+static void pci_endpoint_test_release_irq(struct pci_endpoint_test *test)
+{
+	int i;
+	struct pci_dev *pdev = test->pdev;
+	struct device *dev = &pdev->dev;
+
+	for (i = 0; i < test->num_irqs; i++)
+		devm_free_irq(dev, pci_irq_vector(pdev, i), test);
+
+	test->num_irqs = 0;
+}
+
+static bool pci_endpoint_test_request_irq(struct pci_endpoint_test *test)
+{
+	int i;
+	int err;
+	struct pci_dev *pdev = test->pdev;
+	struct device *dev = &pdev->dev;
+
+	for (i = 0; i < test->num_irqs; i++) {
+		err = devm_request_irq(dev, pci_irq_vector(pdev, i),
+				       pci_endpoint_test_irqhandler,
+				       IRQF_SHARED, DRV_MODULE_NAME, test);
+		if (err)
+			goto fail;
+	}
+
+	return true;
+
+fail:
+	switch (irq_type) {
+	case IRQ_TYPE_LEGACY:
+		dev_err(dev, "Failed to request IRQ %d for Legacy\n",
+			pci_irq_vector(pdev, i));
+		break;
+	case IRQ_TYPE_MSI:
+		dev_err(dev, "Failed to request IRQ %d for MSI %d\n",
+			pci_irq_vector(pdev, i),
+			i + 1);
+		break;
+	case IRQ_TYPE_MSIX:
+		dev_err(dev, "Failed to request IRQ %d for MSI-X %d\n",
+			pci_irq_vector(pdev, i),
+			i + 1);
+		break;
+	}
+
+	return false;
+}
+
 static bool pci_endpoint_test_bar(struct pci_endpoint_test *test,
 				  enum pci_barno barno)
 {
@@ -247,6 +342,11 @@ static bool pci_endpoint_test_copy(struct pci_endpoint_test *test, size_t size)
 	if (size > SIZE_MAX - alignment)
 		goto err;
 
+	if (irq_type < IRQ_TYPE_LEGACY || irq_type > IRQ_TYPE_MSIX) {
+		dev_err(dev, "Invalid IRQ type option\n");
+		goto err;
+	}
+
 	orig_src_addr = dma_alloc_coherent(dev, size + alignment,
 					   &orig_src_phys_addr, GFP_KERNEL);
 	if (!orig_src_addr) {
@@ -337,6 +437,11 @@ static bool pci_endpoint_test_write(struct pci_endpoint_test *test, size_t size)
 	if (size > SIZE_MAX - alignment)
 		goto err;
 
+	if (irq_type < IRQ_TYPE_LEGACY || irq_type > IRQ_TYPE_MSIX) {
+		dev_err(dev, "Invalid IRQ type option\n");
+		goto err;
+	}
+
 	orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr,
 				       GFP_KERNEL);
 	if (!orig_addr) {
@@ -400,6 +505,11 @@ static bool pci_endpoint_test_read(struct pci_endpoint_test *test, size_t size)
 	if (size > SIZE_MAX - alignment)
 		goto err;
 
+	if (irq_type < IRQ_TYPE_LEGACY || irq_type > IRQ_TYPE_MSIX) {
+		dev_err(dev, "Invalid IRQ type option\n");
+		goto err;
+	}
+
 	orig_addr = dma_alloc_coherent(dev, size + alignment, &orig_phys_addr,
 				       GFP_KERNEL);
 	if (!orig_addr) {
@@ -440,6 +550,38 @@ err:
 	return ret;
 }
 
+static bool pci_endpoint_test_set_irq(struct pci_endpoint_test *test,
+				      int req_irq_type)
+{
+	struct pci_dev *pdev = test->pdev;
+	struct device *dev = &pdev->dev;
+
+	if (req_irq_type < IRQ_TYPE_LEGACY || req_irq_type > IRQ_TYPE_MSIX) {
+		dev_err(dev, "Invalid IRQ type option\n");
+		return false;
+	}
+
+	if (irq_type == req_irq_type)
+		return true;
+
+	pci_endpoint_test_release_irq(test);
+	pci_endpoint_test_free_irq_vectors(test);
+
+	if (!pci_endpoint_test_alloc_irq_vectors(test, req_irq_type))
+		goto err;
+
+	if (!pci_endpoint_test_request_irq(test))
+		goto err;
+
+	irq_type = req_irq_type;
+	return true;
+
+err:
+	pci_endpoint_test_free_irq_vectors(test);
+	irq_type = IRQ_TYPE_UNDEFINED;
+	return false;
+}
+
 static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 				    unsigned long arg)
 {
@@ -471,6 +613,12 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 	case PCITEST_COPY:
 		ret = pci_endpoint_test_copy(test, arg);
 		break;
+	case PCITEST_SET_IRQTYPE:
+		ret = pci_endpoint_test_set_irq(test, arg);
+		break;
+	case PCITEST_GET_IRQTYPE:
+		ret = irq_type;
+		break;
 	}
 
 ret:
@@ -486,9 +634,7 @@ static const struct file_operations pci_endpoint_test_fops = {
 static int pci_endpoint_test_probe(struct pci_dev *pdev,
 				   const struct pci_device_id *ent)
 {
-	int i;
 	int err;
-	int irq = 0;
 	int id;
 	char name[20];
 	enum pci_barno bar;
@@ -537,41 +683,11 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 
 	pci_set_master(pdev);
 
-	switch (irq_type) {
-	case IRQ_TYPE_LEGACY:
-		break;
-	case IRQ_TYPE_MSI:
-		irq = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSI);
-		if (irq < 0)
-			dev_err(dev, "Failed to get MSI interrupts\n");
-		test->num_irqs = irq;
-		break;
-	case IRQ_TYPE_MSIX:
-		irq = pci_alloc_irq_vectors(pdev, 1, 2048, PCI_IRQ_MSIX);
-		if (irq < 0)
-			dev_err(dev, "Failed to get MSI-X interrupts\n");
-		test->num_irqs = irq;
-		break;
-	default:
-		dev_err(dev, "Invalid IRQ type selected\n");
-	}
+	if (!pci_endpoint_test_alloc_irq_vectors(test, irq_type))
+		goto err_disable_irq;
 
-	err = devm_request_irq(dev, pdev->irq, pci_endpoint_test_irqhandler,
-			       IRQF_SHARED, DRV_MODULE_NAME, test);
-	if (err) {
-		dev_err(dev, "Failed to request IRQ %d\n", pdev->irq);
-		goto err_disable_msi;
-	}
-
-	for (i = 1; i < irq; i++) {
-		err = devm_request_irq(dev, pci_irq_vector(pdev, i),
-				       pci_endpoint_test_irqhandler,
-				       IRQF_SHARED, DRV_MODULE_NAME, test);
-		if (err)
-			dev_err(dev, "Failed to request IRQ %d for MSI%s %d\n",
-				pci_irq_vector(pdev, i),
-				irq_type == IRQ_TYPE_MSIX ? "-X" : "", i + 1);
-	}
+	if (!pci_endpoint_test_request_irq(test))
+		goto err_disable_irq;
 
 	for (bar = BAR_0; bar <= BAR_5; bar++) {
 		if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) {
@@ -630,13 +746,10 @@ err_iounmap:
 		if (test->bar[bar])
 			pci_iounmap(pdev, test->bar[bar]);
 	}
+	pci_endpoint_test_release_irq(test);
 
-	for (i = 0; i < irq; i++)
-		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), test);
-
-err_disable_msi:
-	pci_disable_msi(pdev);
-	pci_disable_msix(pdev);
+err_disable_irq:
+	pci_endpoint_test_free_irq_vectors(test);
 	pci_release_regions(pdev);
 
 err_disable_pdev:
@@ -648,7 +761,6 @@ err_disable_pdev:
 static void pci_endpoint_test_remove(struct pci_dev *pdev)
 {
 	int id;
-	int i;
 	enum pci_barno bar;
 	struct pci_endpoint_test *test = pci_get_drvdata(pdev);
 	struct miscdevice *misc_device = &test->miscdev;
@@ -665,10 +777,10 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev)
 		if (test->bar[bar])
 			pci_iounmap(pdev, test->bar[bar]);
 	}
-	for (i = 0; i < test->num_irqs; i++)
-		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), test);
-	pci_disable_msi(pdev);
-	pci_disable_msix(pdev);
+
+	pci_endpoint_test_release_irq(test);
+	pci_endpoint_test_free_irq_vectors(test);
+
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 }
diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index d746fb159dcd..cbf422e56696 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -17,5 +17,7 @@
 #define PCITEST_READ		_IOW('P', 0x5, unsigned long)
 #define PCITEST_COPY		_IOW('P', 0x6, unsigned long)
 #define PCITEST_MSIX		_IOW('P', 0x7, int)
+#define PCITEST_SET_IRQTYPE	_IOW('P', 0x8, int)
+#define PCITEST_GET_IRQTYPE	_IO('P', 0x9)
 
 #endif /* __UAPI_LINUX_PCITEST_H */
-- 
cgit v1.2.3


From 44109c60176ae73924a42a6bef64ef151aba9095 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Tue, 15 May 2018 23:34:28 +0100
Subject: MIPS: Add DSP ASE regset support

Define an NT_MIPS_DSP core file note type and implement a corresponding
regset holding the DSP ASE register context, following the layout of the
`mips_dsp_state' structure, except for the DSPControl register stored as
a 64-bit rather than 32-bit quantity in a 64-bit note.

The lack of DSP ASE register saving to core files can be considered a
design flaw with commit e50c0a8fa60d ("Support the MIPS32 / MIPS64 DSP
ASE."), leading to an incomplete state being saved.  Consequently no DSP
ASE regset has been created with commit 7aeb753b5353 ("MIPS: Implement
task_user_regset_view."), when regset support was added to the MIPS
port.

Additionally there is no way for ptrace(2) to correctly access the DSP
accumulator registers in n32 processes with the existing interfaces.
This is due to 32-bit truncation of data passed with PTRACE_PEEKUSR and
PTRACE_POKEUSR requests, which cannot be avoided owing to how the data
types for ptrace(3) have been defined.  This new NT_MIPS_DSP regset
fills the missing interface gap.

[paul.burton@mips.com:
  - Change NT_MIPS_DSP to 0x800 to avoid conflict with NT_VMCOREDD
    introduced by commit 2724273e8fd0 ("vmcore: add API to collect
    hardware dump in second kernel").
  - Drop stable tag. Whilst I agree the lack of this functionality can
    be considered a flaw in earlier DSP ASE support, it's still new
    functionality which doesn't meet up to the requirements set out in
    Documentation/process/stable-kernel-rules.rst.]

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Signed-off-by: Paul Burton <paul.burton@mips.com>
References: 7aeb753b5353 ("MIPS: Implement task_user_regset_view.")
Patchwork: https://patchwork.linux-mips.org/patch/19330/
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: James Hogan <jhogan@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
---
 arch/mips/kernel/ptrace.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h  |   1 +
 2 files changed, 190 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 8c8d42823bda..a536271ba084 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -41,6 +41,7 @@
 #include <asm/mipsmtregs.h>
 #include <asm/pgtable.h>
 #include <asm/page.h>
+#include <asm/processor.h>
 #include <asm/syscall.h>
 #include <linux/uaccess.h>
 #include <asm/bootinfo.h>
@@ -589,9 +590,179 @@ static int fpr_set(struct task_struct *target,
 	return err;
 }
 
+#if defined(CONFIG_32BIT) || defined(CONFIG_MIPS32_O32)
+
+/*
+ * Copy the DSP context to the supplied 32-bit NT_MIPS_DSP buffer.
+ */
+static int dsp32_get(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     void *kbuf, void __user *ubuf)
+{
+	unsigned int start, num_regs, i;
+	u32 dspregs[NUM_DSP_REGS + 1];
+
+	BUG_ON(count % sizeof(u32));
+	/* Without the DSP ASE there is no context to report: -EIO. */
+	if (!cpu_has_dsp)
+		return -EIO;
+	/* pos/count are bytes; work in u32 slots [start, start + num_regs). */
+	start = pos / sizeof(u32);
+	num_regs = count / sizeof(u32);
+	/* Refuse a window that runs past the final (dspcontrol) slot. */
+	if (start + num_regs > NUM_DSP_REGS + 1)
+		return -EIO;
+	/* Fill every slot of the window: the end index is start + num_regs. */
+	for (i = start; i < start + num_regs; i++)
+		switch (i) {
+		case 0 ... NUM_DSP_REGS - 1:
+			dspregs[i] = target->thread.dsp.dspr[i];
+			break;
+		case NUM_DSP_REGS:
+			dspregs[i] = target->thread.dsp.dspcontrol;
+			break;
+		}
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, dspregs, 0,
+				   sizeof(dspregs));
+}
+
+/*
+ * Copy the supplied 32-bit NT_MIPS_DSP buffer to the DSP context.
+ */
+static int dsp32_set(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf)
+{
+	unsigned int start, num_regs, i;
+	u32 dspregs[NUM_DSP_REGS + 1];
+	int err;
+
+	BUG_ON(count % sizeof(u32));
+	/* Without the DSP ASE there is no context to update: -EIO. */
+	if (!cpu_has_dsp)
+		return -EIO;
+	/* pos/count are bytes; work in u32 slots [start, start + num_regs). */
+	start = pos / sizeof(u32);
+	num_regs = count / sizeof(u32);
+	/* Refuse a window that runs past the final (dspcontrol) slot. */
+	if (start + num_regs > NUM_DSP_REGS + 1)
+		return -EIO;
+
+	err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, dspregs, 0,
+				 sizeof(dspregs));
+	if (err)
+		return err;
+	/* Apply every slot of the window: the end is start + num_regs. */
+	for (i = start; i < start + num_regs; i++)
+		switch (i) {
+		case 0 ... NUM_DSP_REGS - 1:
+			target->thread.dsp.dspr[i] = (s32)dspregs[i];
+			break;
+		case NUM_DSP_REGS:
+			target->thread.dsp.dspcontrol = (s32)dspregs[i];
+			break;
+		}
+
+	return 0;
+}
+
+#endif /* CONFIG_32BIT || CONFIG_MIPS32_O32 */
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Copy the DSP context to the supplied 64-bit NT_MIPS_DSP buffer.
+ */
+static int dsp64_get(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     void *kbuf, void __user *ubuf)
+{
+	unsigned int start, num_regs, i;
+	u64 dspregs[NUM_DSP_REGS + 1];
+
+	BUG_ON(count % sizeof(u64));
+	/* Without the DSP ASE there is no context to report: -EIO. */
+	if (!cpu_has_dsp)
+		return -EIO;
+	/* pos/count are bytes; work in u64 slots [start, start + num_regs). */
+	start = pos / sizeof(u64);
+	num_regs = count / sizeof(u64);
+	/* Refuse a window that runs past the final (dspcontrol) slot. */
+	if (start + num_regs > NUM_DSP_REGS + 1)
+		return -EIO;
+	/* Fill every slot of the window: the end index is start + num_regs. */
+	for (i = start; i < start + num_regs; i++)
+		switch (i) {
+		case 0 ... NUM_DSP_REGS - 1:
+			dspregs[i] = target->thread.dsp.dspr[i];
+			break;
+		case NUM_DSP_REGS:
+			dspregs[i] = target->thread.dsp.dspcontrol;
+			break;
+		}
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, dspregs, 0,
+				   sizeof(dspregs));
+}
+
+/*
+ * Copy the supplied 64-bit NT_MIPS_DSP buffer to the DSP context.
+ */
+static int dsp64_set(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf)
+{
+	unsigned int start, num_regs, i;
+	u64 dspregs[NUM_DSP_REGS + 1];
+	int err;
+
+	BUG_ON(count % sizeof(u64));
+	/* Without the DSP ASE there is no context to update: -EIO. */
+	if (!cpu_has_dsp)
+		return -EIO;
+	/* pos/count are bytes; work in u64 slots [start, start + num_regs). */
+	start = pos / sizeof(u64);
+	num_regs = count / sizeof(u64);
+	/* Refuse a window that runs past the final (dspcontrol) slot. */
+	if (start + num_regs > NUM_DSP_REGS + 1)
+		return -EIO;
+
+	err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, dspregs, 0,
+				 sizeof(dspregs));
+	if (err)
+		return err;
+	/* Apply every slot of the window: the end is start + num_regs. */
+	for (i = start; i < start + num_regs; i++)
+		switch (i) {
+		case 0 ... NUM_DSP_REGS - 1:
+			target->thread.dsp.dspr[i] = dspregs[i];
+			break;
+		case NUM_DSP_REGS:
+			target->thread.dsp.dspcontrol = dspregs[i];
+			break;
+		}
+
+	return 0;
+}
+
+#endif /* CONFIG_64BIT */
+
+/*
+ * Return the DSP register count, or -ENODEV when cpu_has_dsp is false.
+ */
+static int dsp_active(struct task_struct *target,
+		      const struct user_regset *regset)
+{
+	return cpu_has_dsp ? NUM_DSP_REGS + 1 : -ENODEV;
+}
+
 enum mips_regset {
 	REGSET_GPR,
 	REGSET_FPR,
+	REGSET_DSP,
 };
 
 struct pt_regs_offset {
@@ -697,6 +868,15 @@ static const struct user_regset mips_regsets[] = {
 		.get		= fpr_get,
 		.set		= fpr_set,
 	},
+	[REGSET_DSP] = {
+		.core_note_type	= NT_MIPS_DSP,
+		.n		= NUM_DSP_REGS + 1,
+		.size		= sizeof(u32),
+		.align		= sizeof(u32),
+		.get		= dsp32_get,
+		.set		= dsp32_set,
+		.active		= dsp_active,
+	},
 };
 
 static const struct user_regset_view user_mips_view = {
@@ -728,6 +908,15 @@ static const struct user_regset mips64_regsets[] = {
 		.get		= fpr_get,
 		.set		= fpr_set,
 	},
+	[REGSET_DSP] = {
+		.core_note_type	= NT_MIPS_DSP,
+		.n		= NUM_DSP_REGS + 1,
+		.size		= sizeof(u64),
+		.align		= sizeof(u64),
+		.get		= dsp64_get,
+		.set		= dsp64_set,
+		.active		= dsp_active,
+	},
 };
 
 static const struct user_regset_view user_mips64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 4e12c423b9fe..e326c99b3881 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -422,6 +422,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
+#define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
-- 
cgit v1.2.3


From 1ae22a0e35636efceab83728ba30b013df761592 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@mips.com>
Date: Tue, 15 May 2018 23:40:18 +0100
Subject: MIPS: Add FP_MODE regset support

Define an NT_MIPS_FP_MODE core file note and implement a corresponding
regset holding the state handled by PR_SET_FP_MODE and PR_GET_FP_MODE
prctl(2) requests.  This lets debug software correctly interpret the
contents of floating-point general registers both in live debugging and
in core files, and also switch floating-point modes of a live process.

[paul.burton@mips.com:
  - Changed NT_MIPS_FP_MODE to 0x801 to match first nibble of
    NT_MIPS_DSP, which was also changed to avoid a conflict.]

Signed-off-by: Maciej W. Rozycki <macro@mips.com>
Signed-off-by: Paul Burton <paul.burton@mips.com>
Patchwork: https://patchwork.linux-mips.org/patch/19331/
Cc: James Hogan <jhogan@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
---
 arch/mips/kernel/ptrace.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h  |  1 +
 2 files changed, 64 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index a536271ba084..e5ba56c01ee0 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -759,10 +759,57 @@ static int dsp_active(struct task_struct *target,
 	return cpu_has_dsp ? NUM_DSP_REGS + 1 : -ENODEV;
 }
 
+/* Copy target's FP mode, as one int, to the NT_MIPS_FP_MODE buffer.  */
+static int fp_mode_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
+{
+	int fp_mode;
+	/* Fetch the mode into a local so the copyout has an int to read. */
+	fp_mode = mips_get_process_fp_mode(target);
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fp_mode, 0,
+				   sizeof(fp_mode));
+}
+
+/*
+ * Copy the supplied NT_MIPS_FP_MODE buffer to the FP mode setting.
+ *
+ * We optimize for the case where `count % sizeof(int) == 0', which
+ * is supposed to have been guaranteed by the kernel before calling
+ * us, e.g. in `ptrace_regset'.  We enforce that requirement, so
+ * that we can safely avoid preinitializing temporaries for partial
+ * mode writes.
+ */
+static int fp_mode_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	int fp_mode;
+	int err;
+
+	BUG_ON(count % sizeof(int));
+
+	if (pos + count > sizeof(fp_mode))
+		return -EIO;
+	/* An empty write leaves the mode untouched. */
+	if (!count)
+		return 0;
+
+	err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fp_mode, 0,
+				 sizeof(fp_mode));
+	if (err)
+		return err;
+	/* copyin gets &count, so the empty case was tested before it. */
+	return mips_set_process_fp_mode(target, fp_mode);
+}
+
 enum mips_regset {
 	REGSET_GPR,
 	REGSET_FPR,
 	REGSET_DSP,
+	REGSET_FP_MODE,
 };
 
 struct pt_regs_offset {
@@ -877,6 +924,14 @@ static const struct user_regset mips_regsets[] = {
 		.set		= dsp32_set,
 		.active		= dsp_active,
 	},
+	[REGSET_FP_MODE] = {
+		.core_note_type	= NT_MIPS_FP_MODE,
+		.n		= 1,
+		.size		= sizeof(int),
+		.align		= sizeof(int),
+		.get		= fp_mode_get,
+		.set		= fp_mode_set,
+	},
 };
 
 static const struct user_regset_view user_mips_view = {
@@ -917,6 +972,14 @@ static const struct user_regset mips64_regsets[] = {
 		.set		= dsp64_set,
 		.active		= dsp_active,
 	},
+	[REGSET_FP_MODE] = {
+		.core_note_type	= NT_MIPS_FP_MODE,
+		.n		= 1,
+		.size		= sizeof(int),
+		.align		= sizeof(int),
+		.get		= fp_mode_get,
+		.set		= fp_mode_set,
+	},
 };
 
 static const struct user_regset_view user_mips64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index e326c99b3881..c5358e0ae7c5 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -423,6 +423,7 @@ typedef struct elf64_shdr {
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
+#define NT_MIPS_FP_MODE	0x801		/* MIPS floating-point mode */
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
-- 
cgit v1.2.3


From 07a557f47d7e09b2c60ad4d51b1ac8b035b75f73 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 17 Jul 2018 19:27:16 +0300
Subject: net/sched: tunnel_key: Allow to set tos and ttl for tc based ip
 tunnels

Allow user-space to provide tos and ttl to be set for the tunnel headers.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h |  2 ++
 net/sched/act_tunnel_key.c                | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index e284fec8c467..be384d63e1b5 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -39,6 +39,8 @@ enum {
 	TCA_TUNNEL_KEY_ENC_OPTS,	/* Nested TCA_TUNNEL_KEY_ENC_OPTS_
 					 * attributes
 					 */
+	TCA_TUNNEL_KEY_ENC_TOS,		/* u8 */
+	TCA_TUNNEL_KEY_ENC_TTL,		/* u8 */
 	__TCA_TUNNEL_KEY_MAX,
 };
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 3ec585d58762..22f26e9ea8f1 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -197,6 +197,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
 	[TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
 	[TCA_TUNNEL_KEY_NO_CSUM]      = { .type = NLA_U8 },
 	[TCA_TUNNEL_KEY_ENC_OPTS]     = { .type = NLA_NESTED },
+	[TCA_TUNNEL_KEY_ENC_TOS]      = { .type = NLA_U8 },
+	[TCA_TUNNEL_KEY_ENC_TTL]      = { .type = NLA_U8 },
 };
 
 static int tunnel_key_init(struct net *net, struct nlattr *nla,
@@ -216,6 +218,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	int opts_len = 0;
 	__be64 key_id;
 	__be16 flags;
+	u8 tos, ttl;
 	int ret = 0;
 	int err;
 
@@ -273,6 +276,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			}
 		}
 
+		tos = 0;
+		if (tb[TCA_TUNNEL_KEY_ENC_TOS])
+			tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]);
+		ttl = 0;
+		if (tb[TCA_TUNNEL_KEY_ENC_TTL])
+			ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]);
+
 		if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
 		    tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
 			__be32 saddr;
@@ -281,7 +291,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
 			daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
 
-			metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+			metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl,
 						    dst_port, flags,
 						    key_id, opts_len);
 		} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
@@ -292,7 +302,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
 			daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
 
-			metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+			metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
 						      0, flags,
 						      key_id, 0);
 		} else {
@@ -504,6 +514,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 			       !(key->tun_flags & TUNNEL_CSUM)) ||
 		    tunnel_key_opts_dump(skb, info))
 			goto nla_put_failure;
+
+		if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos))
+			goto nla_put_failure;
+
+		if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl))
+			goto nla_put_failure;
 	}
 
 	tcf_tm_dump(&tm, &t->tcf_tm);
-- 
cgit v1.2.3


From 0e2c17b64d5c7f57bcd7054ef87797376dcdee26 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 17 Jul 2018 19:27:18 +0300
Subject: net/sched: cls_flower: Support matching on ip tos and ttl for tunnels

Allow users to set rules matching on ipv4 tos and ttl or
ipv6 traffic-class and hoplimit of tunnel headers.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  5 +++++
 net/sched/cls_flower.c       | 43 ++++++++++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index c4262d911596..b4512254036b 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -473,6 +473,11 @@ enum {
 	TCA_FLOWER_KEY_CVLAN_PRIO,	/* u8   */
 	TCA_FLOWER_KEY_CVLAN_ETH_TYPE,	/* be16 */
 
+	TCA_FLOWER_KEY_ENC_IP_TOS,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TOS_MASK,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TTL,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TTL_MASK,	/* u8 */
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c53fdd411f90..38d74803e2df 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -52,6 +52,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_mpls mpls;
 	struct flow_dissector_key_tcp tcp;
 	struct flow_dissector_key_ip ip;
+	struct flow_dissector_key_ip enc_ip;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -453,6 +454,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_CVLAN_ID]	= { .type = NLA_U16 },
 	[TCA_FLOWER_KEY_CVLAN_PRIO]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_CVLAN_ETH_TYPE]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_ENC_IP_TOS]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_IP_TTL]	 = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -561,17 +566,17 @@ static int fl_set_key_flags(struct nlattr **tb,
 	return 0;
 }
 
-static void fl_set_key_ip(struct nlattr **tb,
+static void fl_set_key_ip(struct nlattr **tb, bool encap,
 			  struct flow_dissector_key_ip *key,
 			  struct flow_dissector_key_ip *mask)
 {
-		fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS,
-			       &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK,
-			       sizeof(key->tos));
+	int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+	int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+	int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+	int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
 
-		fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL,
-			       &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK,
-			       sizeof(key->ttl));
+	fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos));
+	fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
 }
 
 static int fl_set_key(struct net *net, struct nlattr **tb,
@@ -633,7 +638,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
 			       &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
 			       sizeof(key->basic.ip_proto));
-		fl_set_key_ip(tb, &key->ip, &mask->ip);
+		fl_set_key_ip(tb, false, &key->ip, &mask->ip);
 	}
 
 	if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
@@ -768,6 +773,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		       &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
 		       sizeof(key->enc_tp.dst));
 
+	fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+
 	if (tb[TCA_FLOWER_KEY_FLAGS])
 		ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
 
@@ -860,6 +867,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask)
 			   enc_control);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
+	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+			     FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
 
 	skb_flow_dissector_init(&mask->dissector, keys, cnt);
 }
@@ -1208,14 +1217,17 @@ static int fl_dump_key_mpls(struct sk_buff *skb,
 	return 0;
 }
 
-static int fl_dump_key_ip(struct sk_buff *skb,
+static int fl_dump_key_ip(struct sk_buff *skb, bool encap,
 			  struct flow_dissector_key_ip *key,
 			  struct flow_dissector_key_ip *mask)
 {
-	if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos,
-			    TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) ||
-	    fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl,
-			    TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl)))
+	int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+	int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+	int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+	int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
+
+	if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) ||
+	    fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)))
 		return -1;
 
 	return 0;
@@ -1361,7 +1373,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
 	    (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
 			    &mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
 			    sizeof(key->basic.ip_proto)) ||
-	    fl_dump_key_ip(skb, &key->ip, &mask->ip)))
+	    fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
 		goto nla_put_failure;
 
 	if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
@@ -1486,7 +1498,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
 			    TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
 			    &mask->enc_tp.dst,
 			    TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
-			    sizeof(key->enc_tp.dst)))
+			    sizeof(key->enc_tp.dst)) ||
+	    fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip))
 		goto nla_put_failure;
 
 	if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
-- 
cgit v1.2.3


From 36fc3c8c282c01ad1570bd864de52f128d731b75 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 19 Jul 2018 22:14:31 -0700
Subject: bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h

This patch shrinks the BTF_INT_BITS() mask.  The current
btf_int_check_meta() ensures the nr_bits of an integer
cannot exceed 64.  Hence, it is mostly an uapi cleanup.

The actual btf usage (i.e. seq_show()) is also modified
to use u8 instead of u16.  The verification path (e.g.
btf_int_check_meta()) stays as is in order to handle invalid BTF.

Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h |  2 +-
 kernel/bpf/btf.c         | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 0b5ddbe135a4..972265f32871 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -76,7 +76,7 @@ struct btf_type {
  */
 #define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
-#define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
+#define BTF_INT_BITS(VAL)	((VAL)  & 0x000000ff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
 #define BTF_INT_SIGNED	(1 << 0)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e016ac3afa24..9704934252b3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
  */
 static bool btf_type_int_is_regular(const struct btf_type *t)
 {
-	u16 nr_bits, nr_bytes;
+	u8 nr_bits, nr_bytes;
 	u32 int_data;
 
 	int_data = btf_type_int(t);
@@ -993,12 +993,16 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 {
 	u16 left_shift_bits, right_shift_bits;
 	u32 int_data = btf_type_int(t);
-	u16 nr_bits = BTF_INT_BITS(int_data);
-	u16 total_bits_offset;
-	u16 nr_copy_bytes;
-	u16 nr_copy_bits;
+	u8 nr_bits = BTF_INT_BITS(int_data);
+	u8 total_bits_offset;
+	u8 nr_copy_bytes;
+	u8 nr_copy_bits;
 	u64 print_num;
 
+	/*
+	 * bits_offset is at most 7.
+	 * BTF_INT_OFFSET() cannot exceed 64 bits.
+	 */
 	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
 	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
 	bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
@@ -1028,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
 	u32 int_data = btf_type_int(t);
 	u8 encoding = BTF_INT_ENCODING(int_data);
 	bool sign = encoding & BTF_INT_SIGNED;
-	u32 nr_bits = BTF_INT_BITS(int_data);
+	u8 nr_bits = BTF_INT_BITS(int_data);
 
 	if (bits_offset || BTF_INT_OFFSET(int_data) ||
 	    BITS_PER_BYTE_MASKED(nr_bits)) {
-- 
cgit v1.2.3


From 048c6d88a0214757926f264823829e79154fcd4f Mon Sep 17 00:00:00 2001
From: Guido Kiener <guido@kiener-muenchen.de>
Date: Wed, 18 Jul 2018 10:45:36 +0200
Subject: usb: usbtmc: Add ioctls to set/get usb timeout

Add ioctls USBTMC_IOCTL_GET_TIMEOUT / USBTMC_IOCTL_SET_TIMEOUT to
get/set the I/O timeout for a specific file handle.

Different operations on an instrument can take different lengths of
time thus it is important to be able to set the timeout slightly
longer than the expected duration of each operation to optimise the
responsiveness of the application. As the instrument may be shared by
multiple applications the timeout should be settable on a per file
descriptor basis.

Tested-by: Dave Penkler <dpenkler@gmail.com>
Reviewed-by: Steve Bayless <steve_bayless@keysight.com>
Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Guido Kiener <guido.kiener@rohde-schwarz.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 65 ++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/usb/tmc.h |  4 +++
 2 files changed, 63 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 243e8446b8dd..36d740c4c6fb 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -30,6 +30,8 @@
  */
 #define USBTMC_SIZE_IOBUFFER	2048
 
+/* Minimum USB timeout (in milliseconds) */
+#define USBTMC_MIN_TIMEOUT	100
 /* Default USB timeout (in milliseconds) */
 #define USBTMC_TIMEOUT		5000
 
@@ -115,6 +117,7 @@ struct usbtmc_file_data {
 	struct usbtmc_device_data *data;
 	struct list_head file_elem;
 
+	u32            timeout;
 	u8             srq_byte;
 	atomic_t       srq_asserted;
 };
@@ -153,6 +156,8 @@ static int usbtmc_open(struct inode *inode, struct file *filp)
 	mutex_lock(&data->io_mutex);
 	file_data->data = data;
 
+	file_data->timeout = USBTMC_TIMEOUT;
+
 	INIT_LIST_HEAD(&file_data->file_elem);
 	spin_lock_irq(&data->dev_lock);
 	list_add_tail(&file_data->file_elem, &data->file_list);
@@ -460,7 +465,7 @@ static int usbtmc488_ioctl_read_stb(struct usbtmc_file_data *file_data,
 		rv = wait_event_interruptible_timeout(
 			data->waitq,
 			atomic_read(&data->iin_data_valid) != 0,
-			USBTMC_TIMEOUT);
+			file_data->timeout);
 		if (rv < 0) {
 			dev_dbg(dev, "wait interrupted %d\n", rv);
 			goto exit;
@@ -560,8 +565,10 @@ static int usbtmc488_ioctl_simple(struct usbtmc_device_data *data,
  *
  * Also updates bTag_last_write.
  */
-static int send_request_dev_dep_msg_in(struct usbtmc_device_data *data, size_t transfer_size)
+static int send_request_dev_dep_msg_in(struct usbtmc_file_data *file_data,
+				       size_t transfer_size)
 {
+	struct usbtmc_device_data *data = file_data->data;
 	int retval;
 	u8 *buffer;
 	int actual;
@@ -590,7 +597,8 @@ static int send_request_dev_dep_msg_in(struct usbtmc_device_data *data, size_t t
 	retval = usb_bulk_msg(data->usb_dev,
 			      usb_sndbulkpipe(data->usb_dev,
 					      data->bulk_out),
-			      buffer, USBTMC_HEADER_SIZE, &actual, USBTMC_TIMEOUT);
+			      buffer, USBTMC_HEADER_SIZE,
+			      &actual, file_data->timeout);
 
 	/* Store bTag (in case we need to abort) */
 	data->bTag_last_write = data->bTag;
@@ -640,7 +648,7 @@ static ssize_t usbtmc_read(struct file *filp, char __user *buf,
 
 	dev_dbg(dev, "usb_bulk_msg_in: count(%zu)\n", count);
 
-	retval = send_request_dev_dep_msg_in(data, count);
+	retval = send_request_dev_dep_msg_in(file_data, count);
 
 	if (retval < 0) {
 		if (data->auto_abort)
@@ -659,7 +667,7 @@ static ssize_t usbtmc_read(struct file *filp, char __user *buf,
 				      usb_rcvbulkpipe(data->usb_dev,
 						      data->bulk_in),
 				      buffer, USBTMC_SIZE_IOBUFFER, &actual,
-				      USBTMC_TIMEOUT);
+				      file_data->timeout);
 
 		dev_dbg(dev, "usb_bulk_msg: retval(%u), done(%zu), remaining(%zu), actual(%d)\n", retval, done, remaining, actual);
 
@@ -832,7 +840,7 @@ static ssize_t usbtmc_write(struct file *filp, const char __user *buf,
 					      usb_sndbulkpipe(data->usb_dev,
 							      data->bulk_out),
 					      buffer, n_bytes,
-					      &actual, USBTMC_TIMEOUT);
+					      &actual, file_data->timeout);
 			if (retval != 0)
 				break;
 			n_bytes -= actual;
@@ -1189,6 +1197,41 @@ exit:
 	return rv;
 }
 
+/*
+ * Get the usb timeout value
+ */
+static int usbtmc_ioctl_get_timeout(struct usbtmc_file_data *file_data,
+				void __user *arg)
+{
+	u32 timeout;
+
+	timeout = file_data->timeout;
+
+	return put_user(timeout, (__u32 __user *)arg);
+}
+
+/*
+ * Set the usb timeout value
+ */
+static int usbtmc_ioctl_set_timeout(struct usbtmc_file_data *file_data,
+				void __user *arg)
+{
+	u32 timeout;
+
+	if (get_user(timeout, (__u32 __user *)arg))
+		return -EFAULT;
+
+	/* Note that timeout = 0 means
+	 * MAX_SCHEDULE_TIMEOUT in usb_control_msg
+	 */
+	if (timeout < USBTMC_MIN_TIMEOUT)
+		return -EINVAL;
+
+	file_data->timeout = timeout;
+
+	return 0;
+}
+
 static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct usbtmc_file_data *file_data;
@@ -1229,6 +1272,16 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		retval = usbtmc_ioctl_abort_bulk_in(data);
 		break;
 
+	case USBTMC_IOCTL_GET_TIMEOUT:
+		retval = usbtmc_ioctl_get_timeout(file_data,
+						  (void __user *)arg);
+		break;
+
+	case USBTMC_IOCTL_SET_TIMEOUT:
+		retval = usbtmc_ioctl_set_timeout(file_data,
+						  (void __user *)arg);
+		break;
+
 	case USBTMC488_IOCTL_GET_CAPS:
 		retval = copy_to_user((void __user *)arg,
 				&data->usb488_caps,
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index 03f6adc8f35b..a89ffc33532e 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -16,6 +16,8 @@
 #ifndef __LINUX_USB_TMC_H
 #define __LINUX_USB_TMC_H
 
+#include <linux/types.h>   /* __u8 etc */
+
 /* USB TMC status values */
 #define USBTMC_STATUS_SUCCESS				0x01
 #define USBTMC_STATUS_PENDING				0x02
@@ -46,6 +48,8 @@
 #define USBTMC_IOCTL_ABORT_BULK_IN	_IO(USBTMC_IOC_NR, 4)
 #define USBTMC_IOCTL_CLEAR_OUT_HALT	_IO(USBTMC_IOC_NR, 6)
 #define USBTMC_IOCTL_CLEAR_IN_HALT	_IO(USBTMC_IOC_NR, 7)
+#define USBTMC_IOCTL_GET_TIMEOUT	_IOR(USBTMC_IOC_NR, 9, __u32)
+#define USBTMC_IOCTL_SET_TIMEOUT	_IOW(USBTMC_IOC_NR, 10, __u32)
 #define USBTMC488_IOCTL_GET_CAPS	_IOR(USBTMC_IOC_NR, 17, unsigned char)
 #define USBTMC488_IOCTL_READ_STB	_IOR(USBTMC_IOC_NR, 18, unsigned char)
 #define USBTMC488_IOCTL_REN_CONTROL	_IOW(USBTMC_IOC_NR, 19, unsigned char)
-- 
cgit v1.2.3


From fe78a7c637057070f20ac9460608a18d775e6349 Mon Sep 17 00:00:00 2001
From: Guido Kiener <guido@kiener-muenchen.de>
Date: Wed, 18 Jul 2018 10:45:37 +0200
Subject: usb: usbtmc: Add ioctl for trigger

add USBTMC488_IOCTL_TRIGGER to send TRIGGER Bulk-OUT header
according to Subclass USB488 Specification

The usbtmc trigger command is equivalent to the IEEE 488 GET (Group
Execute Trigger) action. While the "*TRG" command can be sent as
data to perform the same operation, in some situations an instrument
will be busy and unable to process the data immediately in which
case the USBTMC488_IOCTL_TRIGGER can be used to trigger the
instrument with lower latency.

Reviewed-by: Steve Bayless <steve_bayless@keysight.com>
Tested-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Guido Kiener <guido.kiener@rohde-schwarz.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 49 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/usb/tmc.h |  1 +
 2 files changed, 50 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 36d740c4c6fb..38fc7abdc00c 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -557,6 +557,51 @@ static int usbtmc488_ioctl_simple(struct usbtmc_device_data *data,
 	return rv;
 }
 
+/*
+ * Sends a TRIGGER Bulk-OUT command message
+ * See the USBTMC-USB488 specification, Table 2.
+ *
+ * Also updates bTag_last_write.
+ */
+static int usbtmc488_ioctl_trigger(struct usbtmc_file_data *file_data)
+{
+	struct usbtmc_device_data *data = file_data->data;
+	int retval;
+	u8 *buffer;
+	int actual;
+
+	buffer = kzalloc(USBTMC_HEADER_SIZE, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	buffer[0] = 128;
+	buffer[1] = data->bTag;
+	buffer[2] = ~data->bTag;
+
+	retval = usb_bulk_msg(data->usb_dev,
+			      usb_sndbulkpipe(data->usb_dev,
+					      data->bulk_out),
+			      buffer, USBTMC_HEADER_SIZE,
+			      &actual, file_data->timeout);
+
+	/* Store bTag (in case we need to abort) */
+	data->bTag_last_write = data->bTag;
+
+	/* Increment bTag -- and increment again if zero */
+	data->bTag++;
+	if (!data->bTag)
+		data->bTag++;
+
+	kfree(buffer);
+	if (retval < 0) {
+		dev_err(&data->intf->dev, "%s returned %d\n",
+			__func__, retval);
+		return retval;
+	}
+
+	return 0;
+}
+
 /*
  * Sends a REQUEST_DEV_DEP_MSG_IN message on the Bulk-OUT endpoint.
  * @transfer_size: number of bytes to request from the device.
@@ -1309,6 +1354,10 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		retval = usbtmc488_ioctl_simple(data, (void __user *)arg,
 						USBTMC488_REQUEST_LOCAL_LOCKOUT);
 		break;
+
+	case USBTMC488_IOCTL_TRIGGER:
+		retval = usbtmc488_ioctl_trigger(file_data);
+		break;
 	}
 
 skip_io_on_zombie:
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index a89ffc33532e..c61bad7150dd 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -55,6 +55,7 @@
 #define USBTMC488_IOCTL_REN_CONTROL	_IOW(USBTMC_IOC_NR, 19, unsigned char)
 #define USBTMC488_IOCTL_GOTO_LOCAL	_IO(USBTMC_IOC_NR, 20)
 #define USBTMC488_IOCTL_LOCAL_LOCKOUT	_IO(USBTMC_IOC_NR, 21)
+#define USBTMC488_IOCTL_TRIGGER		_IO(USBTMC_IOC_NR, 22)
 
 /* Driver encoded usb488 capabilities */
 #define USBTMC488_CAPABILITY_TRIGGER         1
-- 
cgit v1.2.3


From fbd83971f9429849dd3a105b663822d15b7b992b Mon Sep 17 00:00:00 2001
From: Guido Kiener <guido@kiener-muenchen.de>
Date: Wed, 18 Jul 2018 10:45:38 +0200
Subject: usb: usbtmc: Add ioctl for EOM bit

add USBTMC_IOCTL_EOM_ENABLE to specify EOM bit for next write()
call. Sets Bit 0 of field 'bmTransferAttributes' of DEV_DEP_MSG_OUT
Bulk-OUT Header.
Allows fine grained control over end of message handling on a
per file descriptor basis.

Reviewed-by: Steve Bayless <steve_bayless@keysight.com>
Tested-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Guido Kiener <guido.kiener@rohde-schwarz.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 28 +++++++++++++++++++++++++++-
 include/uapi/linux/usb/tmc.h |  2 ++
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 38fc7abdc00c..c77e0ac6260b 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -120,6 +120,7 @@ struct usbtmc_file_data {
 	u32            timeout;
 	u8             srq_byte;
 	atomic_t       srq_asserted;
+	u8             eom_val;
 };
 
 /* Forward declarations */
@@ -157,6 +158,7 @@ static int usbtmc_open(struct inode *inode, struct file *filp)
 	file_data->data = data;
 
 	file_data->timeout = USBTMC_TIMEOUT;
+	file_data->eom_val = 1;
 
 	INIT_LIST_HEAD(&file_data->file_elem);
 	spin_lock_irq(&data->dev_lock);
@@ -855,7 +857,7 @@ static ssize_t usbtmc_write(struct file *filp, const char __user *buf,
 			buffer[8] = 0;
 		} else {
 			this_part = remaining;
-			buffer[8] = 1;
+			buffer[8] = file_data->eom_val;
 		}
 
 		/* Setup IO buffer for DEV_DEP_MSG_OUT message */
@@ -1277,6 +1279,25 @@ static int usbtmc_ioctl_set_timeout(struct usbtmc_file_data *file_data,
 	return 0;
 }
 
+/*
+ * enables/disables sending EOM on write
+ */
+static int usbtmc_ioctl_eom_enable(struct usbtmc_file_data *file_data,
+				void __user *arg)
+{
+	u8 eom_enable;
+
+	if (copy_from_user(&eom_enable, arg, sizeof(eom_enable)))
+		return -EFAULT;
+
+	if (eom_enable > 1)
+		return -EINVAL;
+
+	file_data->eom_val = eom_enable;
+
+	return 0;
+}
+
 static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct usbtmc_file_data *file_data;
@@ -1327,6 +1348,11 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 						  (void __user *)arg);
 		break;
 
+	case USBTMC_IOCTL_EOM_ENABLE:
+		retval = usbtmc_ioctl_eom_enable(file_data,
+						 (void __user *)arg);
+		break;
+
 	case USBTMC488_IOCTL_GET_CAPS:
 		retval = copy_to_user((void __user *)arg,
 				&data->usb488_caps,
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index c61bad7150dd..e7317dfdd2ae 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -50,6 +50,8 @@
 #define USBTMC_IOCTL_CLEAR_IN_HALT	_IO(USBTMC_IOC_NR, 7)
 #define USBTMC_IOCTL_GET_TIMEOUT	_IOR(USBTMC_IOC_NR, 9, __u32)
 #define USBTMC_IOCTL_SET_TIMEOUT	_IOW(USBTMC_IOC_NR, 10, __u32)
+#define USBTMC_IOCTL_EOM_ENABLE	        _IOW(USBTMC_IOC_NR, 11, __u8)
+
 #define USBTMC488_IOCTL_GET_CAPS	_IOR(USBTMC_IOC_NR, 17, unsigned char)
 #define USBTMC488_IOCTL_READ_STB	_IOR(USBTMC_IOC_NR, 18, unsigned char)
 #define USBTMC488_IOCTL_REN_CONTROL	_IOW(USBTMC_IOC_NR, 19, unsigned char)
-- 
cgit v1.2.3


From 12dcaeb77e67c1162a2604f6b589266baec2d1ef Mon Sep 17 00:00:00 2001
From: Guido Kiener <guido@kiener-muenchen.de>
Date: Wed, 18 Jul 2018 10:45:39 +0200
Subject: usb: usbtmc: Add ioctl for termination character

add USBTMC_IOCTL_CONFIG_TERMCHAR to control TermChar handling
for next read(). Controls field 'TermChar' and Bit 1 of field
'bmTransferAttributes' of REQUEST_DEV_DEP_MSG_IN BULK-OUT header.

Allows enabling/disabling of terminating a read on reception of
term_char individually for each read request.

Reviewed-by: Steve Bayless <steve_bayless@keysight.com>
Tested-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Guido Kiener <guido.kiener@rohde-schwarz.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 36 ++++++++++++++++++++++++++++++++++--
 include/uapi/linux/usb/tmc.h |  6 ++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index c77e0ac6260b..1b7b2e402adb 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -121,6 +121,8 @@ struct usbtmc_file_data {
 	u8             srq_byte;
 	atomic_t       srq_asserted;
 	u8             eom_val;
+	u8             term_char;
+	bool           term_char_enabled;
 };
 
 /* Forward declarations */
@@ -157,7 +159,10 @@ static int usbtmc_open(struct inode *inode, struct file *filp)
 	mutex_lock(&data->io_mutex);
 	file_data->data = data;
 
+	/* copy default values from device settings */
 	file_data->timeout = USBTMC_TIMEOUT;
+	file_data->term_char = data->TermChar;
+	file_data->term_char_enabled = data->TermCharEnabled;
 	file_data->eom_val = 1;
 
 	INIT_LIST_HEAD(&file_data->file_elem);
@@ -634,9 +639,9 @@ static int send_request_dev_dep_msg_in(struct usbtmc_file_data *file_data,
 	buffer[5] = transfer_size >> 8;
 	buffer[6] = transfer_size >> 16;
 	buffer[7] = transfer_size >> 24;
-	buffer[8] = data->TermCharEnabled * 2;
+	buffer[8] = file_data->term_char_enabled * 2;
 	/* Use term character? */
-	buffer[9] = data->TermChar;
+	buffer[9] = file_data->term_char;
 	buffer[10] = 0; /* Reserved */
 	buffer[11] = 0; /* Reserved */
 
@@ -1298,6 +1303,28 @@ static int usbtmc_ioctl_eom_enable(struct usbtmc_file_data *file_data,
 	return 0;
 }
 
+/*
+ * Configure termination character for read()
+ */
+static int usbtmc_ioctl_config_termc(struct usbtmc_file_data *file_data,
+				void __user *arg)
+{
+	struct usbtmc_termchar termc;
+
+	if (copy_from_user(&termc, arg, sizeof(termc)))
+		return -EFAULT;
+
+	if ((termc.term_char_enabled > 1) ||
+		(termc.term_char_enabled &&
+		!(file_data->data->capabilities.device_capabilities & 1)))
+		return -EINVAL;
+
+	file_data->term_char = termc.term_char;
+	file_data->term_char_enabled = termc.term_char_enabled;
+
+	return 0;
+}
+
 static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct usbtmc_file_data *file_data;
@@ -1353,6 +1380,11 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 						 (void __user *)arg);
 		break;
 
+	case USBTMC_IOCTL_CONFIG_TERMCHAR:
+		retval = usbtmc_ioctl_config_termc(file_data,
+						   (void __user *)arg);
+		break;
+
 	case USBTMC488_IOCTL_GET_CAPS:
 		retval = copy_to_user((void __user *)arg,
 				&data->usb488_caps,
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index e7317dfdd2ae..729af2f861a4 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -40,6 +40,11 @@
 #define USBTMC488_REQUEST_GOTO_LOCAL			161
 #define USBTMC488_REQUEST_LOCAL_LOCKOUT			162
 
+struct usbtmc_termchar {
+	__u8 term_char;
+	__u8 term_char_enabled;
+} __attribute__ ((packed));
+
 /* Request values for USBTMC driver's ioctl entry point */
 #define USBTMC_IOC_NR			91
 #define USBTMC_IOCTL_INDICATOR_PULSE	_IO(USBTMC_IOC_NR, 1)
@@ -51,6 +56,7 @@
 #define USBTMC_IOCTL_GET_TIMEOUT	_IOR(USBTMC_IOC_NR, 9, __u32)
 #define USBTMC_IOCTL_SET_TIMEOUT	_IOW(USBTMC_IOC_NR, 10, __u32)
 #define USBTMC_IOCTL_EOM_ENABLE	        _IOW(USBTMC_IOC_NR, 11, __u8)
+#define USBTMC_IOCTL_CONFIG_TERMCHAR	_IOW(USBTMC_IOC_NR, 12, struct usbtmc_termchar)
 
 #define USBTMC488_IOCTL_GET_CAPS	_IOR(USBTMC_IOC_NR, 17, unsigned char)
 #define USBTMC488_IOCTL_READ_STB	_IOR(USBTMC_IOC_NR, 18, unsigned char)
-- 
cgit v1.2.3


From c73314e6ebb2651a70ca8a3ff08d4bd6b9f9ade1 Mon Sep 17 00:00:00 2001
From: Mathieu Othacehe <m.othacehe@gmail.com>
Date: Fri, 20 Jul 2018 19:34:25 +0200
Subject: iio: Add channel for Phase

Add new channel type support for phase.

This channel may be used by Time-of-flight sensors to express the
phase difference between emitted and received signals. Those sensors
will then use the phase shift of return signals to approximate the
distance to objects.

Signed-off-by: Mathieu Othacehe <m.othacehe@gmail.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/ABI/testing/sysfs-bus-iio | 7 +++++++
 drivers/iio/industrialio-core.c         | 1 +
 include/uapi/linux/iio/types.h          | 1 +
 tools/iio/iio_event_monitor.c           | 2 ++
 4 files changed, 11 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index c7353030670a..c9cfa833cf47 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1675,3 +1675,10 @@ KernelVersion:	4.12
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw counter device counters direction for channel Y.
+
+What:		/sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
+KernelVersion:	4.18
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Raw (unscaled) phase difference reading from channel Y
+		that can be processed to radians.
\ No newline at end of file
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index ed1b3ebade94..a16ad5a4ab0c 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -86,6 +86,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_INDEX] = "index",
 	[IIO_GRAVITY]  = "gravity",
 	[IIO_POSITIONRELATIVE]  = "positionrelative",
+	[IIO_PHASE] = "phase",
 };
 
 static const char * const iio_modifier_names[] = {
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index 033c7d28924e..e4df3cc268db 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -45,6 +45,7 @@ enum iio_chan_type {
 	IIO_INDEX,
 	IIO_GRAVITY,
 	IIO_POSITIONRELATIVE,
+	IIO_PHASE,
 };
 
 enum iio_modifier {
diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index 148f69dfae75..f478f5558720 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -59,6 +59,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_UVINDEX] = "uvindex",
 	[IIO_GRAVITY] = "gravity",
 	[IIO_POSITIONRELATIVE] = "positionrelative",
+	[IIO_PHASE] = "phase",
 };
 
 static const char * const iio_ev_type_text[] = {
@@ -153,6 +154,7 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_UVINDEX:
 	case IIO_GRAVITY:
 	case IIO_POSITIONRELATIVE:
+	case IIO_PHASE:
 		break;
 	default:
 		return false;
-- 
cgit v1.2.3


From be26b3a73413c2ebf14d5e76a66ad964e6458080 Mon Sep 17 00:00:00 2001
From: Dongjiu Geng <gengdongjiu@huawei.com>
Date: Thu, 19 Jul 2018 16:24:23 +0100
Subject: arm64: KVM: export the capability to set guest SError syndrome

With the arm64 RAS Extension, user space can inject a virtual SError
with a specified ESR. User space therefore needs to know whether KVM
supports injecting such an SError; this patch adds a capability to query that.

KVM checks whether the system supports the RAS Extension. If it does, KVM
returns true to user space; otherwise it returns false.

Signed-off-by: Dongjiu Geng <gengdongjiu@huawei.com>
Reviewed-by: James Morse <james.morse@arm.com>
[expanded documentation wording]
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/virtual/kvm/api.txt | 26 ++++++++++++++++++++++++++
 arch/arm64/kvm/reset.c            |  3 +++
 include/uapi/linux/kvm.h          |  1 +
 3 files changed, 30 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 284d36e72f28..dbbb95d5798a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -907,6 +907,18 @@ SError is pending, the architecture's 'Multiple SError interrupts' rules should
 be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and
 Serviceability (RAS) Specification").
 
+SError exceptions always have an ESR value. Some CPUs have the ability to
+specify what the virtual SError's ESR value should be. These systems will
+advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will
+always have a non-zero value when read, and the agent making an SError pending
+should specify the ISS field in the lower 24 bits of exception.serror_esr. If
+the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the
+events with exception.has_esr as zero, KVM will choose an ESR.
+
+Specifying exception.has_esr on a system that does not support it will return
+-EINVAL. Setting anything other than the lower 24bits of exception.serror_esr
+will return -EINVAL.
+
 struct kvm_vcpu_events {
 	struct {
 		__u8 serror_pending;
@@ -4664,3 +4676,17 @@ This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush
 hypercalls:
 HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx,
 HvFlushVirtualAddressList, HvFlushVirtualAddressListEx.
+
+8.19 KVM_CAP_ARM_INJECT_SERROR_ESR
+
+Architectures: arm, arm64
+
+This capability indicates that userspace can specify (via the
+KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it
+takes a virtual SError interrupt exception.
+If KVM advertises this capability, userspace can only specify the ISS field for
+the ESR syndrome. Other parts of the ESR, such as the EC are generated by the
+CPU when the exception is taken. If this virtual SError is taken to EL1 using
+AArch64, this value will be reported in the ISS field of ESR_ELx.
+
+See KVM_CAP_VCPU_EVENTS for more details.
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index a3db01a28062..067c6ba969bd 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -77,6 +77,9 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ARM_PMU_V3:
 		r = kvm_arm_support_pmu_v3();
 		break;
+	case KVM_CAP_ARM_INJECT_SERROR_ESR:
+		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
+		break;
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_VCPU_EVENTS:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b6270a3b38e9..a7d9bc4e4068 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_GET_MSR_FEATURES 153
 #define KVM_CAP_HYPERV_EVENTFD 154
 #define KVM_CAP_HYPERV_TLBFLUSH 155
+#define KVM_CAP_ARM_INJECT_SERROR_ESR 156
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 2756f68c314917d03eb348084edb08bb929139d9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 23 Jul 2018 11:16:59 +0300
Subject: net: bridge: add support for backup port

This patch adds a new port attribute - IFLA_BRPORT_BACKUP_PORT, which
allows to set a backup port to be used for known unicast traffic if the
port has gone carrier down. The backup pointer is rcu protected and set
only under RTNL, a counter is maintained so when deleting a port we know
how many other ports reference it as a backup and we remove it from all.
Also the pointer is in the first cache line which is hot at the time of
the check and thus in the common case we only add one more test.
The backup port will be used only for the non-flooding case since
it's a part of the bridge and the flooded packets will be forwarded to it
anyway. To remove the forwarding just send a 0/non-existing backup port.
This is used to avoid numerous scalability problems when using MLAG most
notably if we have thousands of fdbs one would need to change all of them
on port carrier going down which takes too long and causes a storm of fdb
notifications (and again when the port comes back up). In a Multi-chassis
Link Aggregation setup usually hosts are connected to two different
switches which act as a single logical switch. Those switches usually have
a control and backup link between them called peerlink which might be used
for communication in case a host loses connectivity to one of them.
We need a fast way to fail over in case a host port goes down and currently
none of the solutions (like bond) can fulfill the requirements because
the participating ports are actually the "master" devices and must have the
same peerlink as their backup interface and at the same time all of them
must participate in the bridge device. As Roopa noted it's normal practice
in routing called fast re-route where a precalculated backup path is used
when the main one is down.
Another use case of this is with EVPN, having a single vxlan device which
is backup of every port. Due to the nature of master devices it's not
currently possible to use one device as a backup for many and still have
all of them participate in the bridge (which is master itself).
More detailed information about MLAG is available at the link below.
https://docs.cumulusnetworks.com/display/DOCS/Multi-Chassis+Link+Aggregation+-+MLAG

Further explanation and a diagram by Roopa:
Two switches acting in a MLAG pair are connected by the peerlink
interface which is a bridge port.

the config on one of the switches looks like the below. The other
switch also has a similar config.
eth0 is connected to one port on the server. And the server is
connected to both switches.

br0 -- team0---eth0
      |
      -- switch-peerlink

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_forward.c      | 16 ++++++++++++-
 net/bridge/br_if.c           | 53 ++++++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_netlink.c      | 30 ++++++++++++++++++++++++-
 net/bridge/br_private.h      |  3 +++
 net/bridge/br_sysfs_if.c     | 33 +++++++++++++++++++++++++++
 6 files changed, 134 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8759cfb8aa2e..01b5069a73a5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -334,6 +334,7 @@ enum {
 	IFLA_BRPORT_GROUP_FWD_MASK,
 	IFLA_BRPORT_NEIGH_SUPPRESS,
 	IFLA_BRPORT_ISOLATED,
+	IFLA_BRPORT_BACKUP_PORT,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 9019f326fe81..5372e2042adf 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev,
 void br_forward(const struct net_bridge_port *to,
 		struct sk_buff *skb, bool local_rcv, bool local_orig)
 {
-	if (to && should_deliver(to, skb)) {
+	if (unlikely(!to))
+		goto out;
+
+	/* redirect to backup link if the destination port is down */
+	if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
+		struct net_bridge_port *backup_port;
+
+		backup_port = rcu_dereference(to->backup_port);
+		if (unlikely(!backup_port))
+			goto out;
+		to = backup_port;
+	}
+
+	if (should_deliver(to, skb)) {
 		if (local_rcv)
 			deliver_clone(to, skb, local_orig);
 		else
@@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to,
 		return;
 	}
 
+out:
 	if (!local_rcv)
 		kfree_skb(skb);
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index e7c8d55212aa..0363f1bdc401 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -170,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br)
 	}
 }
 
+int nbp_backup_change(struct net_bridge_port *p,
+		      struct net_device *backup_dev)
+{
+	struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
+	struct net_bridge_port *backup_p = NULL;
+
+	ASSERT_RTNL();
+
+	if (backup_dev) {
+		if (!br_port_exists(backup_dev))
+			return -ENOENT;
+
+		backup_p = br_port_get_rtnl(backup_dev);
+		if (backup_p->br != p->br)
+			return -EINVAL;
+	}
+
+	if (p == backup_p)
+		return -EINVAL;
+
+	if (old_backup == backup_p)
+		return 0;
+
+	/* if the backup link is already set, clear it */
+	if (old_backup)
+		old_backup->backup_redirected_cnt--;
+
+	if (backup_p)
+		backup_p->backup_redirected_cnt++;
+	rcu_assign_pointer(p->backup_port, backup_p);
+
+	return 0;
+}
+
+static void nbp_backup_clear(struct net_bridge_port *p)
+{
+	nbp_backup_change(p, NULL);
+	if (p->backup_redirected_cnt) {
+		struct net_bridge_port *cur_p;
+
+		list_for_each_entry(cur_p, &p->br->port_list, list) {
+			struct net_bridge_port *backup_p;
+
+			backup_p = rtnl_dereference(cur_p->backup_port);
+			if (backup_p == p)
+				nbp_backup_change(cur_p, NULL);
+		}
+	}
+
+	WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
+}
+
 static void nbp_update_port_count(struct net_bridge *br)
 {
 	struct net_bridge_port *p;
@@ -295,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p)
 	nbp_vlan_flush(p);
 	br_fdb_delete_by_port(br, p, 0, 1);
 	switchdev_deferred_process();
+	nbp_backup_clear(p);
 
 	nbp_update_port_count(br);
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 9f5eb05b0373..ec2b58a09f76 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
 		+ nla_total_size(1) /* IFLA_OPERSTATE */
 		+ nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
 		+ nla_total_size(br_get_link_af_size_filtered(dev,
-				 filter_mask)); /* IFLA_AF_SPEC */
+				 filter_mask)) /* IFLA_AF_SPEC */
+		+ nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
 }
 
 static int br_port_fill_attrs(struct sk_buff *skb,
 			      const struct net_bridge_port *p)
 {
 	u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
+	struct net_bridge_port *backup_p;
 	u64 timerval;
 
 	if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
@@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		return -EMSGSIZE;
 #endif
 
+	/* we might be called only with br->lock */
+	rcu_read_lock();
+	backup_p = rcu_dereference(p->backup_port);
+	if (backup_p)
+		nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
+			    backup_p->dev->ifindex);
+	rcu_read_unlock();
+
 	return 0;
 }
 
@@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
 	[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
 	[IFLA_BRPORT_ISOLATED]	= { .type = NLA_U8 },
+	[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	if (err)
 		return err;
 
+	if (tb[IFLA_BRPORT_BACKUP_PORT]) {
+		struct net_device *backup_dev = NULL;
+		u32 backup_ifindex;
+
+		backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
+		if (backup_ifindex) {
+			backup_dev = __dev_get_by_index(dev_net(p->dev),
+							backup_ifindex);
+			if (!backup_dev)
+				return -ENOENT;
+		}
+
+		err = nbp_backup_change(p, backup_dev);
+		if (err)
+			return err;
+	}
+
 	br_port_flags_change(p, old_flags ^ p->flags);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index cf0005d2a4d0..11ed2029985f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -237,6 +237,7 @@ struct net_bridge_port {
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
 	struct net_bridge_vlan_group	__rcu *vlgrp;
 #endif
+	struct net_bridge_port		__rcu *backup_port;
 
 	/* STP */
 	u8				priority;
@@ -281,6 +282,7 @@ struct net_bridge_port {
 	int				offload_fwd_mark;
 #endif
 	u16				group_fwd_mask;
+	u16				backup_redirected_cnt;
 };
 
 #define kobj_to_brport(obj)	container_of(obj, struct net_bridge_port, kobj)
@@ -597,6 +599,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br,
 					netdev_features_t features);
 void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
 void br_manage_promisc(struct net_bridge *br);
+int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
 
 /* br_input.c */
 int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 4ac940067754..7c87a2fe5248 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -191,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p,
 static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
 		   store_group_fwd_mask);
 
+static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
+{
+	struct net_bridge_port *backup_p;
+	int ret = 0;
+
+	rcu_read_lock();
+	backup_p = rcu_dereference(p->backup_port);
+	if (backup_p)
+		ret = sprintf(buf, "%s\n", backup_p->dev->name);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int store_backup_port(struct net_bridge_port *p, char *buf)
+{
+	struct net_device *backup_dev = NULL;
+	char *nl = strchr(buf, '\n');
+
+	if (nl)
+		*nl = '\0';
+
+	if (strlen(buf) > 0) {
+		backup_dev = __dev_get_by_name(dev_net(p->dev), buf);
+		if (!backup_dev)
+			return -ENOENT;
+	}
+
+	return nbp_backup_change(p, backup_dev);
+}
+static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port);
+
 BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
 BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
 BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
@@ -254,6 +286,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_group_fwd_mask,
 	&brport_attr_neigh_suppress,
 	&brport_attr_isolated,
+	&brport_attr_backup_port,
 	NULL
 };
 
-- 
cgit v1.2.3


From c601171d7a60b5b09d7c2fe0579953323a80744e Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Mon, 23 Jul 2018 13:53:08 +0200
Subject: net/smc: provide smc mode in smc_diag.c

Rename field diag_fallback into diag_mode and set the smc mode of a
connection explicitly.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/smc_diag.h | 9 ++++++++-
 net/smc/smc_diag.c            | 7 ++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h
index 92be255e534c..48ae3ee22b2d 100644
--- a/include/uapi/linux/smc_diag.h
+++ b/include/uapi/linux/smc_diag.h
@@ -20,7 +20,7 @@ struct smc_diag_req {
 struct smc_diag_msg {
 	__u8	diag_family;
 	__u8	diag_state;
-	__u8	diag_fallback;
+	__u8	diag_mode;
 	__u8	diag_shutdown;
 	struct inet_diag_sockid id;
 
@@ -28,6 +28,13 @@ struct smc_diag_msg {
 	__u64	diag_inode;
 };
 
+/* Mode of a connection */
+enum {
+	SMC_DIAG_MODE_SMCR,
+	SMC_DIAG_MODE_FALLBACK_TCP,
+	SMC_DIAG_MODE_SMCD,
+};
+
 /* Extensions */
 
 enum {
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 6d83eef1b743..d772cd10297e 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -91,7 +91,12 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 	r = nlmsg_data(nlh);
 	smc_diag_msg_common_fill(r, sk);
 	r->diag_state = sk->sk_state;
-	r->diag_fallback = smc->use_fallback;
+	if (smc->use_fallback)
+		r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
+	else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
+		r->diag_mode = SMC_DIAG_MODE_SMCD;
+	else
+		r->diag_mode = SMC_DIAG_MODE_SMCR;
 	user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
 	if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
 		goto errout;
-- 
cgit v1.2.3


From c0e4e0fd952b73bf6aae67e92b9a496a52837eb9 Mon Sep 17 00:00:00 2001
From: Maxime Roussin-Bélanger <maxime.roussinbelanger@gmail.com>
Date: Thu, 19 Jul 2018 16:26:24 -0400
Subject: iio: Add modifier for DUV light
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Maxime Roussin-Bélanger <maxime.roussinbelanger@gmail.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/ABI/testing/sysfs-bus-iio | 7 +++++--
 drivers/iio/industrialio-core.c         | 1 +
 include/uapi/linux/iio/types.h          | 1 +
 tools/iio/iio_event_monitor.c           | 2 ++
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index c9cfa833cf47..a5b4f223641d 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1307,13 +1307,16 @@ What:		/sys/.../iio:deviceX/in_intensityY_raw
 What:		/sys/.../iio:deviceX/in_intensityY_ir_raw
 What:		/sys/.../iio:deviceX/in_intensityY_both_raw
 What:		/sys/.../iio:deviceX/in_intensityY_uv_raw
+What:		/sys/.../iio:deviceX/in_intensityY_duv_raw
 KernelVersion:	3.4
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Unit-less light intensity. Modifiers both and ir indicate
 		that measurements contain visible and infrared light
-		components or just infrared light, respectively. Modifier uv indicates
-		that measurements contain ultraviolet light components.
+		components or just infrared light, respectively. Modifier
+		uv indicates that measurements contain ultraviolet light
+		components. Modifier duv indicates that measurements
+		contain deep ultraviolet light components.
 
 What:		/sys/.../iio:deviceX/in_uvindex_input
 KernelVersion:	4.6
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index a16ad5a4ab0c..a062cfddc5af 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -110,6 +110,7 @@ static const char * const iio_modifier_names[] = {
 	[IIO_MOD_LIGHT_GREEN] = "green",
 	[IIO_MOD_LIGHT_BLUE] = "blue",
 	[IIO_MOD_LIGHT_UV] = "uv",
+	[IIO_MOD_LIGHT_DUV] = "duv",
 	[IIO_MOD_QUATERNION] = "quaternion",
 	[IIO_MOD_TEMP_AMBIENT] = "ambient",
 	[IIO_MOD_TEMP_OBJECT] = "object",
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index e4df3cc268db..92baabc103ac 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -86,6 +86,7 @@ enum iio_modifier {
 	IIO_MOD_CO2,
 	IIO_MOD_VOC,
 	IIO_MOD_LIGHT_UV,
+	IIO_MOD_LIGHT_DUV,
 };
 
 enum iio_event_type {
diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index f478f5558720..ac2de6b7e89f 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -98,6 +98,7 @@ static const char * const iio_modifier_names[] = {
 	[IIO_MOD_LIGHT_GREEN] = "green",
 	[IIO_MOD_LIGHT_BLUE] = "blue",
 	[IIO_MOD_LIGHT_UV] = "uv",
+	[IIO_MOD_LIGHT_DUV] = "duv",
 	[IIO_MOD_QUATERNION] = "quaternion",
 	[IIO_MOD_TEMP_AMBIENT] = "ambient",
 	[IIO_MOD_TEMP_OBJECT] = "object",
@@ -182,6 +183,7 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_MOD_LIGHT_GREEN:
 	case IIO_MOD_LIGHT_BLUE:
 	case IIO_MOD_LIGHT_UV:
+	case IIO_MOD_LIGHT_DUV:
 	case IIO_MOD_QUATERNION:
 	case IIO_MOD_TEMP_AMBIENT:
 	case IIO_MOD_TEMP_OBJECT:
-- 
cgit v1.2.3


From 32a4f5ecd7381f30ae3bb36dea77a150ba68af2e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Mon, 23 Jul 2018 09:23:06 +0200
Subject: net: sched: introduce chain object to uapi

Allow user to create, destroy, get and dump chain objects. Do that by
extending rtnl commands by the chain-specific ones. User will now be
able to explicitly create or destroy chains (so far this was done only
automatically according to the filter/act needs and refcounting). Also, the
user will receive notification about any chain creation or destruction.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      |   1 +
 include/uapi/linux/rtnetlink.h |   7 +
 net/sched/cls_api.c            | 308 +++++++++++++++++++++++++++++++++++++++--
 security/selinux/nlmsgtab.c    |   2 +-
 4 files changed, 309 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 86f4651784e8..81ec8276db9c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -304,6 +304,7 @@ struct tcf_chain {
 	struct tcf_block *block;
 	u32 index; /* chain index */
 	unsigned int refcnt;
+	bool explicitly_created;
 };
 
 struct tcf_block {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7d8502313c99..46399367627f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -150,6 +150,13 @@ enum {
 	RTM_NEWCACHEREPORT = 96,
 #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT
 
+	RTM_NEWCHAIN = 100,
+#define RTM_NEWCHAIN RTM_NEWCHAIN
+	RTM_DELCHAIN,
+#define RTM_DELCHAIN RTM_DELCHAIN
+	RTM_GETCHAIN,
+#define RTM_GETCHAIN RTM_GETCHAIN
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index eb0bf9037ef9..e65b390336aa 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -262,29 +262,57 @@ static void tcf_chain_hold(struct tcf_chain *chain)
 	++chain->refcnt;
 }
 
-struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
-				bool create)
+static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
+					  u32 chain_index)
 {
 	struct tcf_chain *chain;
 
 	list_for_each_entry(chain, &block->chain_list, list) {
-		if (chain->index == chain_index) {
-			tcf_chain_hold(chain);
+		if (chain->index == chain_index)
 			return chain;
-		}
+	}
+	return NULL;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+			   u32 seq, u16 flags, int event, bool unicast);
+
+struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
+				bool create)
+{
+	struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
+
+	if (chain) {
+		tcf_chain_hold(chain);
+		return chain;
 	}
 
-	return create ? tcf_chain_create(block, chain_index) : NULL;
+	if (!create)
+		return NULL;
+	chain = tcf_chain_create(block, chain_index);
+	if (!chain)
+		return NULL;
+	tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+			RTM_NEWCHAIN, false);
+	return chain;
 }
 EXPORT_SYMBOL(tcf_chain_get);
 
 void tcf_chain_put(struct tcf_chain *chain)
 {
-	if (--chain->refcnt == 0)
+	if (--chain->refcnt == 0) {
+		tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
 		tcf_chain_destroy(chain);
+	}
 }
 EXPORT_SYMBOL(tcf_chain_put);
 
+static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
+{
+	if (chain->explicitly_created)
+		tcf_chain_put(chain);
+}
+
 static bool tcf_block_offload_in_use(struct tcf_block *block)
 {
 	return block->offloadcnt;
@@ -694,8 +722,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 
 	if (block->refcnt == 1) {
 		/* At this point, all the chains should have refcnt >= 1. */
-		list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+		list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+			tcf_chain_put_explicitly_created(chain);
 			tcf_chain_put(chain);
+		}
 
 		block->refcnt--;
 		if (list_empty(&block->chain_list))
@@ -1609,6 +1639,264 @@ out:
 	return skb->len;
 }
 
+static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
+			      struct sk_buff *skb, struct tcf_block *block,
+			      u32 portid, u32 seq, u16 flags, int event)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlmsghdr *nlh;
+	struct tcmsg *tcm;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm__pad1 = 0;
+	tcm->tcm__pad2 = 0;
+	tcm->tcm_handle = 0;
+	if (block->q) {
+		tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex;
+		tcm->tcm_parent = block->q->handle;
+	} else {
+		tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+		tcm->tcm_block_index = block->index;
+	}
+
+	if (nla_put_u32(skb, TCA_CHAIN, chain->index))
+		goto nla_put_failure;
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+			   u32 seq, u16 flags, int event, bool unicast)
+{
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+	struct tcf_block *block = chain->block;
+	struct net *net = block->net;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	if (tc_chain_fill_node(chain, net, skb, block, portid,
+			       seq, flags, event) <= 0) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	if (unicast)
+		return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+	return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
+}
+
+/* Add/delete/get a chain; handles RTM_NEWCHAIN, RTM_DELCHAIN, RTM_GETCHAIN */
+
+static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n,
+			struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tca[TCA_MAX + 1];
+	struct tcmsg *t;
+	u32 parent;
+	u32 chain_index;
+	struct Qdisc *q = NULL;
+	struct tcf_chain *chain = NULL;
+	struct tcf_block *block;
+	unsigned long cl;
+	int err;
+
+	if (n->nlmsg_type != RTM_GETCHAIN &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+replay:
+	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+	if (err < 0)
+		return err;
+
+	t = nlmsg_data(n);
+	parent = t->tcm_parent;
+	cl = 0;
+
+	block = tcf_block_find(net, &q, &parent, &cl,
+			       t->tcm_ifindex, t->tcm_block_index, extack);
+	if (IS_ERR(block))
+		return PTR_ERR(block);
+
+	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+	if (chain_index > TC_ACT_EXT_VAL_MASK) {
+		NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+		return -EINVAL;
+	}
+	chain = tcf_chain_lookup(block, chain_index);
+	if (n->nlmsg_type == RTM_NEWCHAIN) {
+		if (chain) {
+			NL_SET_ERR_MSG(extack, "Filter chain already exists");
+			return -EEXIST;
+		}
+		if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+			NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
+			return -ENOENT;
+		}
+		chain = tcf_chain_create(block, chain_index);
+		if (!chain) {
+			NL_SET_ERR_MSG(extack, "Failed to create filter chain");
+			return -ENOMEM;
+		}
+	} else {
+		if (!chain) {
+			NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+			return -EINVAL;
+		}
+		tcf_chain_hold(chain);
+	}
+
+	switch (n->nlmsg_type) {
+	case RTM_NEWCHAIN:
+		/* In case the chain was successfully added, take a reference
+		 * to the chain. This ensures that an empty chain
+		 * does not disappear at the end of this function.
+		 */
+		tcf_chain_hold(chain);
+		chain->explicitly_created = true;
+		tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+				RTM_NEWCHAIN, false);
+		break;
+	case RTM_DELCHAIN:
+		/* Flush the chain first as the user requested chain removal. */
+		tcf_chain_flush(chain);
+		/* In case the chain was successfully deleted, put a reference
+		 * to the chain previously taken during addition.
+		 */
+		tcf_chain_put_explicitly_created(chain);
+		break;
+	case RTM_GETCHAIN:
+		/* Reply to the requesting socket only (unicast). */
+		err = tc_chain_notify(chain, skb, n->nlmsg_seq,
+				      n->nlmsg_flags, n->nlmsg_type, true);
+		if (err < 0)
+			NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		NL_SET_ERR_MSG(extack, "Unsupported message type");
+		goto errout;
+	}
+
+errout:
+	tcf_chain_put(chain);
+	if (err == -EAGAIN)
+		/* Replay the request. */
+		goto replay;
+	return err;
+}
+
+/* called with RTNL */
+static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tca[TCA_MAX + 1];
+	struct Qdisc *q = NULL;
+	struct tcf_block *block;
+	struct tcf_chain *chain;
+	struct tcmsg *tcm = nlmsg_data(cb->nlh);
+	long index_start;
+	long index;
+	u32 parent;
+	int err;
+
+	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+		return skb->len;
+
+	err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
+	if (err)
+		return err;
+
+	if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+		block = tcf_block_lookup(net, tcm->tcm_block_index);
+		if (!block)
+			goto out;
+		/* If we work with block index, q is NULL and parent value
+		 * will never be used in the following code. The check
+		 * in tcf_fill_node prevents it. However, compiler does not
+		 * see that far, so set parent to zero to silence the warning
+		 * about parent being uninitialized.
+		 */
+		parent = 0;
+	} else {
+		const struct Qdisc_class_ops *cops;
+		struct net_device *dev;
+		unsigned long cl = 0;
+
+		dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+		if (!dev)
+			return skb->len;
+
+		parent = tcm->tcm_parent;
+		if (!parent) {
+			q = dev->qdisc;
+			parent = q->handle;
+		} else {
+			q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+		}
+		if (!q)
+			goto out;
+		cops = q->ops->cl_ops;
+		if (!cops)
+			goto out;
+		if (!cops->tcf_block)
+			goto out;
+		if (TC_H_MIN(tcm->tcm_parent)) {
+			cl = cops->find(q, tcm->tcm_parent);
+			if (cl == 0)
+				goto out;
+		}
+		block = cops->tcf_block(q, cl, NULL);
+		if (!block)
+			goto out;
+		if (tcf_block_shared(block))
+			q = NULL;
+	}
+
+	index_start = cb->args[0];
+	index = 0;
+
+	list_for_each_entry(chain, &block->chain_list, list) {
+		if ((tca[TCA_CHAIN] &&
+		     nla_get_u32(tca[TCA_CHAIN]) != chain->index))
+			continue;
+		if (index < index_start) {
+			index++;
+			continue;
+		}
+		err = tc_chain_fill_node(chain, net, skb, block,
+					 NETLINK_CB(cb->skb).portid,
+					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					 RTM_NEWCHAIN);
+		if (err <= 0)
+			break;
+		index++;
+	}
+
+	cb->args[0] = index;
+
+out:
+	/* If we did no progress, the error (EMSGSIZE) is real */
+	if (skb->len == 0 && err)
+		return err;
+	return skb->len;
+}
+
 void tcf_exts_destroy(struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
@@ -1825,6 +2113,10 @@ static int __init tc_filter_init(void)
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
 		      tc_dump_tfilter, 0);
+	rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
+		      tc_dump_chain, 0);
 
 	return 0;
 
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 7b7433a1a34c..74b951f55608 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -159,7 +159,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 	switch (sclass) {
 	case SECCLASS_NETLINK_ROUTE_SOCKET:
 		/* RTM_MAX always point to RTM_SETxxxx, ie RTM_NEWxxx + 3 */
-		BUILD_BUG_ON(RTM_MAX != (RTM_NEWCACHEREPORT + 3));
+		BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3));
 		err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
 				 sizeof(nlmsg_route_perms));
 		break;
-- 
cgit v1.2.3


From b7ff8b1036f0b0df1390ba6b5e9bc7ec458e857a Mon Sep 17 00:00:00 2001
From: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Date: Mon, 23 Jul 2018 20:51:23 -0700
Subject: rds: Extend RDS API for IPv6 support

There are many data structures (RDS socket options) used by RDS apps
which use a 32 bit integer to store IP address. To support IPv6,
struct in6_addr needs to be used. To ensure backward compatibility, a
new data structure is introduced for each of those data structures
which use a 32 bit integer to represent an IP address. And new socket
options are introduced to use those new structures. This means that
existing apps should work without a problem with the new RDS module.
For apps which want to use IPv6, those new data structures and socket
options can be used. IPv4 mapped address is used to represent IPv4
address in the new data structures.

v4: Revert changes to SO_RDS_TRANSPORT

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rds.h |  69 +++++++++++++++++++++++++++++++-
 net/rds/connection.c     | 101 +++++++++++++++++++++++++++++++++++++++++++----
 net/rds/ib.c             |  52 ++++++++++++++++++++++++
 net/rds/ib_mr.h          |   2 +
 net/rds/ib_rdma.c        |  11 +++++-
 net/rds/recv.c           |  25 ++++++++++++
 net/rds/tcp.c            |  44 +++++++++++++++++++++
 7 files changed, 293 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 20c6bd0b0007..dc520e1a4123 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
 /*
- * Copyright (c) 2008 Oracle.  All rights reserved.
+ * Copyright (c) 2008, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -118,7 +118,17 @@
 #define RDS_INFO_IB_CONNECTIONS		10008
 #define RDS_INFO_CONNECTION_STATS	10009
 #define RDS_INFO_IWARP_CONNECTIONS	10010
-#define RDS_INFO_LAST			10010
+
+/* PF_RDS6 options */
+#define RDS6_INFO_CONNECTIONS		10011
+#define RDS6_INFO_SEND_MESSAGES		10012
+#define RDS6_INFO_RETRANS_MESSAGES	10013
+#define RDS6_INFO_RECV_MESSAGES		10014
+#define RDS6_INFO_SOCKETS		10015
+#define RDS6_INFO_TCP_SOCKETS		10016
+#define RDS6_INFO_IB_CONNECTIONS	10017
+
+#define RDS_INFO_LAST			10017
 
 struct rds_info_counter {
 	__u8	name[32];
@@ -140,6 +150,15 @@ struct rds_info_connection {
 	__u8		flags;
 } __attribute__((packed));
 
+struct rds6_info_connection {
+	__u64		next_tx_seq;
+	__u64		next_rx_seq;
+	struct in6_addr	laddr;
+	struct in6_addr	faddr;
+	__u8		transport[TRANSNAMSIZ];		/* null term ascii */
+	__u8		flags;
+} __attribute__((packed));
+
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
 
@@ -153,6 +172,17 @@ struct rds_info_message {
 	__u8		flags;
 } __attribute__((packed));
 
+struct rds6_info_message {
+	__u64	seq;
+	__u32	len;
+	struct in6_addr	laddr;
+	struct in6_addr	faddr;
+	__be16		lport;
+	__be16		fport;
+	__u8		flags;
+	__u8		tos;
+} __attribute__((packed));
+
 struct rds_info_socket {
 	__u32		sndbuf;
 	__be32		bound_addr;
@@ -163,6 +193,16 @@ struct rds_info_socket {
 	__u64		inum;
 } __attribute__((packed));
 
+struct rds6_info_socket {
+	__u32		sndbuf;
+	struct in6_addr	bound_addr;
+	struct in6_addr	connected_addr;
+	__be16		bound_port;
+	__be16		connected_port;
+	__u32		rcvbuf;
+	__u64		inum;
+} __attribute__((packed));
+
 struct rds_info_tcp_socket {
 	__be32          local_addr;
 	__be16          local_port;
@@ -175,6 +215,18 @@ struct rds_info_tcp_socket {
 	__u32           last_seen_una;
 } __attribute__((packed));
 
+struct rds6_info_tcp_socket {
+	struct in6_addr	local_addr;
+	__be16		local_port;
+	struct in6_addr	peer_addr;
+	__be16		peer_port;
+	__u64		hdr_rem;
+	__u64		data_rem;
+	__u32		last_sent_nxt;
+	__u32		last_expected_una;
+	__u32		last_seen_una;
+} __attribute__((packed));
+
 #define RDS_IB_GID_LEN	16
 struct rds_info_rdma_connection {
 	__be32		src_addr;
@@ -189,6 +241,19 @@ struct rds_info_rdma_connection {
 	__u32		rdma_mr_size;
 };
 
+struct rds6_info_rdma_connection {
+	struct in6_addr	src_addr;
+	struct in6_addr	dst_addr;
+	__u8		src_gid[RDS_IB_GID_LEN];
+	__u8		dst_gid[RDS_IB_GID_LEN];
+
+	__u32		max_send_wr;
+	__u32		max_recv_wr;
+	__u32		max_send_sge;
+	__u32		rdma_mr_max;
+	__u32		rdma_mr_size;
+};
+
 /* RDS message Receive Path Latency points */
 enum rds_message_rxpath_latency {
 	RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 5c9ceed55dae..051e35c1e7c6 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -498,16 +498,19 @@ EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
 static void __rds_inc_msg_cp(struct rds_incoming *inc,
 			     struct rds_info_iterator *iter,
-			     void *saddr, void *daddr, int flip)
+			     void *saddr, void *daddr, int flip, bool isv6)
 {
-	rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
-			  *(__be32 *)daddr, flip);
+	if (isv6)
+		rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+	else
+		rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
+				  *(__be32 *)daddr, flip);
 }
 
 static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 				      struct rds_info_iterator *iter,
 				      struct rds_info_lengths *lens,
-				      int want_send)
+				      int want_send, bool isv6)
 {
 	struct hlist_head *head;
 	struct list_head *list;
@@ -518,7 +521,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 	size_t i;
 	int j;
 
-	len /= sizeof(struct rds_info_message);
+	if (isv6)
+		len /= sizeof(struct rds6_info_message);
+	else
+		len /= sizeof(struct rds_info_message);
 
 	rcu_read_lock();
 
@@ -528,6 +534,9 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 			struct rds_conn_path *cp;
 			int npaths;
 
+			if (!isv6 && conn->c_isv6)
+				continue;
+
 			npaths = (conn->c_trans->t_mp_capable ?
 				 RDS_MPATH_WORKERS : 1);
 
@@ -548,7 +557,7 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 								 iter,
 								 &conn->c_laddr,
 								 &conn->c_faddr,
-								 0);
+								 0, isv6);
 				}
 
 				spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -558,7 +567,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 	rcu_read_unlock();
 
 	lens->nr = total;
-	lens->each = sizeof(struct rds_info_message);
+	if (isv6)
+		lens->each = sizeof(struct rds6_info_message);
+	else
+		lens->each = sizeof(struct rds_info_message);
 }
 
 static void rds_conn_message_info(struct socket *sock, unsigned int len,
@@ -566,7 +578,15 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 				  struct rds_info_lengths *lens,
 				  int want_send)
 {
-	rds_conn_message_info_cmn(sock, len, iter, lens, want_send);
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
+}
+
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+				   struct rds_info_iterator *iter,
+				   struct rds_info_lengths *lens,
+				   int want_send)
+{
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
 }
 
 static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
@@ -576,6 +596,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
 	rds_conn_message_info(sock, len, iter, lens, 1);
 }
 
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+					struct rds_info_iterator *iter,
+					struct rds_info_lengths *lens)
+{
+	rds6_conn_message_info(sock, len, iter, lens, 1);
+}
+
 static void rds_conn_message_info_retrans(struct socket *sock,
 					  unsigned int len,
 					  struct rds_info_iterator *iter,
@@ -584,6 +611,14 @@ static void rds_conn_message_info_retrans(struct socket *sock,
 	rds_conn_message_info(sock, len, iter, lens, 0);
 }
 
+static void rds6_conn_message_info_retrans(struct socket *sock,
+					   unsigned int len,
+					   struct rds_info_iterator *iter,
+					   struct rds_info_lengths *lens)
+{
+	rds6_conn_message_info(sock, len, iter, lens, 0);
+}
+
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens,
@@ -699,6 +734,34 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
 	return 1;
 }
 
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+	struct rds6_info_connection *cinfo6 = buffer;
+	struct rds_connection *conn = cp->cp_conn;
+
+	cinfo6->next_tx_seq = cp->cp_next_tx_seq;
+	cinfo6->next_rx_seq = cp->cp_next_rx_seq;
+	cinfo6->laddr = conn->c_laddr;
+	cinfo6->faddr = conn->c_faddr;
+	strncpy(cinfo6->transport, conn->c_trans->t_name,
+		sizeof(cinfo6->transport));
+	cinfo6->flags = 0;
+
+	rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+			  SENDING);
+	/* XXX Future: return the state rather than these funky bits */
+	rds_conn_info_set(cinfo6->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+			  CONNECTING);
+	rds_conn_info_set(cinfo6->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_UP,
+			  CONNECTED);
+	/* Just return 1 as there is no error case. This is a helper function
+	 * for rds_walk_conn_path_info() and it wants a return value.
+	 */
+	return 1;
+}
+
 static void rds_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens)
@@ -711,6 +774,18 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_connection));
 }
 
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
+
+	rds_walk_conn_path_info(sock, len, iter, lens,
+				rds6_conn_info_visitor,
+				buffer,
+				sizeof(struct rds6_info_connection));
+}
+
 int rds_conn_init(void)
 {
 	int ret;
@@ -732,6 +807,11 @@ int rds_conn_init(void)
 			       rds_conn_message_info_send);
 	rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
 			       rds_conn_message_info_retrans);
+	rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+	rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+			       rds6_conn_message_info_send);
+	rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+			       rds6_conn_message_info_retrans);
 
 	return 0;
 }
@@ -750,6 +830,11 @@ void rds_conn_exit(void)
 				 rds_conn_message_info_send);
 	rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
 				 rds_conn_message_info_retrans);
+	rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+	rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+				 rds6_conn_message_info_send);
+	rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+				 rds6_conn_message_info_retrans);
 }
 
 /*
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 756225c5540f..63d95ea7cdff 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -321,6 +321,43 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 	return 1;
 }
 
+/* IPv6 version of rds_ib_conn_info_visitor(). */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+				     void *buffer)
+{
+	struct rds6_info_rdma_connection *iinfo6 = buffer;
+	struct rds_ib_connection *ic;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_ib_transport)
+		return 0;
+
+	iinfo6->src_addr = conn->c_laddr;
+	iinfo6->dst_addr = conn->c_faddr;
+
+	memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
+	memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_ib_device *rds_ibdev;
+		struct rdma_dev_addr *dev_addr;
+
+		ic = conn->c_transport_data;
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+		rdma_addr_get_sgid(dev_addr,
+				   (union ib_gid *)&iinfo6->src_gid);
+		rdma_addr_get_dgid(dev_addr,
+				   (union ib_gid *)&iinfo6->dst_gid);
+
+		rds_ibdev = ic->rds_ibdev;
+		iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo6->max_send_sge = rds_ibdev->max_sge;
+		rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+	}
+	return 1;
+}
+
 static void rds_ib_ic_info(struct socket *sock, unsigned int len,
 			   struct rds_info_iterator *iter,
 			   struct rds_info_lengths *lens)
@@ -333,6 +370,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_rdma_connection));
 }
 
+/* IPv6 version of rds_ib_ic_info(). */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
+
+	rds_for_each_conn_info(sock, len, iter, lens,
+			       rds6_ib_conn_info_visitor,
+			       buffer,
+			       sizeof(struct rds6_info_rdma_connection));
+}
+
 /*
  * Early RDS/IB was built to only bind to an address if there is an IPoIB
  * device with that address set.
@@ -441,6 +491,7 @@ void rds_ib_exit(void)
 	rds_ib_set_unloading();
 	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
 	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
 	rds_ib_sysctl_exit();
@@ -502,6 +553,7 @@ int rds_ib_init(void)
 	rds_trans_register(&rds_ib_transport);
 
 	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
 
 	goto out;
 
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 0ea4ab017a8c..f440ace584c8 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
 					     int npages);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
 			struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			 struct rds6_info_rdma_connection *iinfo6);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		    struct rds_sock *rs, u32 *key_ret);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0ec9df043dd0..e3c8bbbdb43f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -180,6 +180,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 	iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
 }
 
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			 struct rds6_info_rdma_connection *iinfo6)
+{
+	struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
+
+	iinfo6->rdma_mr_max = pool_1m->max_items;
+	iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+}
+
 struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 1402c21210b1..03cd8df54c26 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -792,3 +792,28 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 
 	rds_info_copy(iter, &minfo, sizeof(minfo));
 }
+
+void rds6_inc_info_copy(struct rds_incoming *inc,
+			struct rds_info_iterator *iter,
+			struct in6_addr *saddr, struct in6_addr *daddr,
+			int flip)
+{
+	struct rds6_info_message minfo6 = {};	/* zeroed: copied out whole */
+
+	minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+
+	if (flip) {
+		minfo6.laddr = *daddr;
+		minfo6.faddr = *saddr;
+		minfo6.lport = inc->i_hdr.h_dport;
+		minfo6.fport = inc->i_hdr.h_sport;
+	} else {
+		minfo6.laddr = *saddr;
+		minfo6.faddr = *daddr;
+		minfo6.lport = inc->i_hdr.h_sport;
+		minfo6.fport = inc->i_hdr.h_dport;
+	}
+
+	rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 890d0e1d8908..7028d6e51947 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -273,6 +273,48 @@ out:
 	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 }
 
+/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
+ * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
+ * address.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+			     struct rds_info_iterator *iter,
+			     struct rds_info_lengths *lens)
+{
+	struct rds6_info_tcp_socket tsinfo6;
+	struct rds_tcp_connection *tc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+	if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
+		goto out;
+
+	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+		struct sock *sk = tc->t_sock->sk;
+		struct inet_sock *inet = inet_sk(sk);
+
+		tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
+		tsinfo6.local_port = inet->inet_sport;
+		tsinfo6.peer_addr = sk->sk_v6_daddr;
+		tsinfo6.peer_port = inet->inet_dport;
+
+		tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
+		tsinfo6.data_rem = tc->t_tinc_data_rem;
+		tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
+		tsinfo6.last_expected_una = tc->t_last_expected_una;
+		tsinfo6.last_seen_una = tc->t_last_seen_una;
+
+		rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
+	}
+
+out:
+	lens->nr = rds6_tcp_tc_count;
+	lens->each = sizeof(tsinfo6);
+
+	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
 static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
 			       __u32 scope_id)
 {
@@ -628,6 +670,7 @@ static void rds_tcp_exit(void)
 	rds_tcp_set_unloading();
 	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+	rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
 	unregister_pernet_device(&rds_tcp_net_ops);
 	rds_tcp_destroy_conns();
 	rds_trans_unregister(&rds_tcp_transport);
@@ -659,6 +702,7 @@ static int rds_tcp_init(void)
 	rds_trans_register(&rds_tcp_transport);
 
 	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+	rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
 
 	goto out;
 out_recv:
-- 
cgit v1.2.3


From 56ab8cdbc1438507d79085fcc7e511327d84aeb8 Mon Sep 17 00:00:00 2001
From: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Date: Thu, 17 May 2018 10:30:16 -0400
Subject: media: v4l: Add support for STD ioctls on subdev nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no way to control the standard of subdevices which are part of
a media device. The ioctls which exist all target video devices
explicitly and the idea is that the video device should talk to the
subdevice. For subdevices part of a media graph this is not possible and
the standard must be controlled on the subdev device directly.

Add four new ioctls to be able to directly interact with subdevices and
control the video standard; VIDIOC_SUBDEV_ENUMSTD, VIDIOC_SUBDEV_G_STD,
VIDIOC_SUBDEV_S_STD and VIDIOC_SUBDEV_QUERYSTD.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/vidioc-enumstd.rst  | 11 +++++++----
 Documentation/media/uapi/v4l/vidioc-g-std.rst    | 14 ++++++++++----
 Documentation/media/uapi/v4l/vidioc-querystd.rst | 11 +++++++----
 drivers/media/v4l2-core/v4l2-subdev.c            | 22 ++++++++++++++++++++++
 include/uapi/linux/v4l2-subdev.h                 |  4 ++++
 5 files changed, 50 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/vidioc-enumstd.rst b/Documentation/media/uapi/v4l/vidioc-enumstd.rst
index b7fda29f46a1..2644a62acd4b 100644
--- a/Documentation/media/uapi/v4l/vidioc-enumstd.rst
+++ b/Documentation/media/uapi/v4l/vidioc-enumstd.rst
@@ -2,14 +2,14 @@
 
 .. _VIDIOC_ENUMSTD:
 
-********************
-ioctl VIDIOC_ENUMSTD
-********************
+*******************************************
+ioctl VIDIOC_ENUMSTD, VIDIOC_SUBDEV_ENUMSTD
+*******************************************
 
 Name
 ====
 
-VIDIOC_ENUMSTD - Enumerate supported video standards
+VIDIOC_ENUMSTD - VIDIOC_SUBDEV_ENUMSTD - Enumerate supported video standards
 
 
 Synopsis
@@ -18,6 +18,9 @@ Synopsis
 .. c:function:: int ioctl( int fd, VIDIOC_ENUMSTD, struct v4l2_standard *argp )
     :name: VIDIOC_ENUMSTD
 
+.. c:function:: int ioctl( int fd, VIDIOC_SUBDEV_ENUMSTD, struct v4l2_standard *argp )
+    :name: VIDIOC_SUBDEV_ENUMSTD
+
 
 Arguments
 =========
diff --git a/Documentation/media/uapi/v4l/vidioc-g-std.rst b/Documentation/media/uapi/v4l/vidioc-g-std.rst
index 90791ab51a53..8d94f0404df2 100644
--- a/Documentation/media/uapi/v4l/vidioc-g-std.rst
+++ b/Documentation/media/uapi/v4l/vidioc-g-std.rst
@@ -2,14 +2,14 @@
 
 .. _VIDIOC_G_STD:
 
-********************************
-ioctl VIDIOC_G_STD, VIDIOC_S_STD
-********************************
+**************************************************************************
+ioctl VIDIOC_G_STD, VIDIOC_S_STD, VIDIOC_SUBDEV_G_STD, VIDIOC_SUBDEV_S_STD
+**************************************************************************
 
 Name
 ====
 
-VIDIOC_G_STD - VIDIOC_S_STD - Query or select the video standard of the current input
+VIDIOC_G_STD - VIDIOC_S_STD - VIDIOC_SUBDEV_G_STD - VIDIOC_SUBDEV_S_STD - Query or select the video standard of the current input
 
 
 Synopsis
@@ -21,6 +21,12 @@ Synopsis
 .. c:function:: int ioctl( int fd, VIDIOC_S_STD, const v4l2_std_id *argp )
     :name: VIDIOC_S_STD
 
+.. c:function:: int ioctl( int fd, VIDIOC_SUBDEV_G_STD, v4l2_std_id *argp )
+    :name: VIDIOC_SUBDEV_G_STD
+
+.. c:function:: int ioctl( int fd, VIDIOC_SUBDEV_S_STD, const v4l2_std_id *argp )
+    :name: VIDIOC_SUBDEV_S_STD
+
 
 Arguments
 =========
diff --git a/Documentation/media/uapi/v4l/vidioc-querystd.rst b/Documentation/media/uapi/v4l/vidioc-querystd.rst
index cf40bca19b9f..a8385cc74818 100644
--- a/Documentation/media/uapi/v4l/vidioc-querystd.rst
+++ b/Documentation/media/uapi/v4l/vidioc-querystd.rst
@@ -2,14 +2,14 @@
 
 .. _VIDIOC_QUERYSTD:
 
-*********************
-ioctl VIDIOC_QUERYSTD
-*********************
+*********************************************
+ioctl VIDIOC_QUERYSTD, VIDIOC_SUBDEV_QUERYSTD
+*********************************************
 
 Name
 ====
 
-VIDIOC_QUERYSTD - Sense the video standard received by the current input
+VIDIOC_QUERYSTD - VIDIOC_SUBDEV_QUERYSTD - Sense the video standard received by the current input
 
 
 Synopsis
@@ -18,6 +18,9 @@ Synopsis
 .. c:function:: int ioctl( int fd, VIDIOC_QUERYSTD, v4l2_std_id *argp )
     :name: VIDIOC_QUERYSTD
 
+.. c:function:: int ioctl( int fd, VIDIOC_SUBDEV_QUERYSTD, v4l2_std_id *argp )
+    :name: VIDIOC_SUBDEV_QUERYSTD
+
 
 Arguments
 =========
diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 6a7f7f75dfd7..2b63fa6b6fc9 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -494,6 +494,28 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg)
 
 	case VIDIOC_SUBDEV_S_DV_TIMINGS:
 		return v4l2_subdev_call(sd, video, s_dv_timings, arg);
+
+	case VIDIOC_SUBDEV_G_STD:
+		return v4l2_subdev_call(sd, video, g_std, arg);
+
+	case VIDIOC_SUBDEV_S_STD: {
+		v4l2_std_id *std = arg;
+
+		return v4l2_subdev_call(sd, video, s_std, *std);
+	}
+
+	case VIDIOC_SUBDEV_ENUMSTD: {
+		struct v4l2_standard *p = arg;
+		v4l2_std_id id;
+
+		if (v4l2_subdev_call(sd, video, g_tvnorms, &id))
+			return -EINVAL;
+
+		return v4l_video_std_enumstd(p, id);
+	}
+
+	case VIDIOC_SUBDEV_QUERYSTD:
+		return v4l2_subdev_call(sd, video, querystd, arg);
 #endif
 	default:
 		return v4l2_subdev_call(sd, core, ioctl, cmd, arg);
diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h
index c95a53e6743c..03970ce30741 100644
--- a/include/uapi/linux/v4l2-subdev.h
+++ b/include/uapi/linux/v4l2-subdev.h
@@ -170,8 +170,12 @@ struct v4l2_subdev_selection {
 #define VIDIOC_SUBDEV_G_SELECTION		_IOWR('V', 61, struct v4l2_subdev_selection)
 #define VIDIOC_SUBDEV_S_SELECTION		_IOWR('V', 62, struct v4l2_subdev_selection)
 /* The following ioctls are identical to the ioctls in videodev2.h */
+#define VIDIOC_SUBDEV_G_STD			_IOR('V', 23, v4l2_std_id)
+#define VIDIOC_SUBDEV_S_STD			_IOW('V', 24, v4l2_std_id)
+#define VIDIOC_SUBDEV_ENUMSTD			_IOWR('V', 25, struct v4l2_standard)
 #define VIDIOC_SUBDEV_G_EDID			_IOWR('V', 40, struct v4l2_edid)
 #define VIDIOC_SUBDEV_S_EDID			_IOWR('V', 41, struct v4l2_edid)
+#define VIDIOC_SUBDEV_QUERYSTD			_IOR('V', 63, v4l2_std_id)
 #define VIDIOC_SUBDEV_S_DV_TIMINGS		_IOWR('V', 87, struct v4l2_dv_timings)
 #define VIDIOC_SUBDEV_G_DV_TIMINGS		_IOWR('V', 88, struct v4l2_dv_timings)
 #define VIDIOC_SUBDEV_ENUM_DV_TIMINGS		_IOWR('V', 98, struct v4l2_enum_dv_timings)
-- 
cgit v1.2.3


From b8f8c8eb408b36ad55dd41a616b3f51998880fb6 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 21 Jul 2018 15:48:47 +0200
Subject: net: phy: add GBit master / slave error detection

Certain PHYs have issues when operating in GBit slave mode and can
be forced to master mode. Examples are RTL8211C, also the Micrel PHY
driver has a DT setting to force master mode.
If two such chips are link partners the autonegotiation will fail.
Standard defines a self-clearing on read, latched-high bit to
indicate this error. Check this bit to inform the user.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 8 ++++++++
 include/uapi/linux/mii.h     | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index b9f5f40a7ac1..db1172db1e7c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1555,6 +1555,14 @@ int genphy_read_status(struct phy_device *phydev)
 			if (adv < 0)
 				return adv;
 
+			if (lpagb & LPA_1000MSFAIL) {
+				if (adv & CTL1000_ENABLE_MASTER)
+					phydev_err(phydev, "Master/Slave resolution failed, maybe conflicting manual settings?\n");
+				else
+					phydev_err(phydev, "Master/Slave resolution failed\n");
+				return -ENOLINK;
+			}
+
 			phydev->lp_advertising =
 				mii_stat1000_to_ethtool_lpa_t(lpagb);
 			common_adv_gb = lpagb & adv << 2;
diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h
index b5c2fdcf23fd..a506216591d6 100644
--- a/include/uapi/linux/mii.h
+++ b/include/uapi/linux/mii.h
@@ -136,6 +136,7 @@
 #define CTL1000_ENABLE_MASTER	0x1000
 
 /* 1000BASE-T Status register */
+#define LPA_1000MSFAIL		0x8000	/* Master/Slave resolution failure */
 #define LPA_1000LOCALRXOK	0x2000	/* Link partner local receiver status */
 #define LPA_1000REMRXOK		0x1000	/* Link partner remote receiver status */
 #define LPA_1000FULL		0x0800	/* Link partner 1000BASE-T full duplex */
-- 
cgit v1.2.3


From aea5f654e6b78a0c976f7a25950155932c77a53f Mon Sep 17 00:00:00 2001
From: Nishanth Devarajan <ndev2021@gmail.com>
Date: Mon, 23 Jul 2018 19:37:41 +0530
Subject: net/sched: add skbprio scheduler

Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
according to their skb->priority field. Under congestion, already-enqueued lower
priority packets will be dropped to make space available for higher priority
packets. Skbprio was conceived as a solution for denial-of-service defenses that
need to route packets with different priorities as a means to overcome DoS
attacks.

v5
*Do not reference qdisc_dev(sch)->tx_queue_len for setting limit. Instead set
default sch->limit to 64.

v4
*Drop Documentation/networking/sch_skbprio.txt doc file to move it to tc man
page for Skbprio, in iproute2.

v3
*Drop max_limit parameter in struct skbprio_sched_data and instead use
sch->limit.

*Reference qdisc_dev(sch)->tx_queue_len only once, during initialisation for
qdisc (previously being referenced every time qdisc changes).

*Move qdisc's detailed description from in-code to Documentation/networking.

*When qdisc is saturated, enqueue incoming packet first before dequeueing
lowest priority packet in queue - improves usage of call stack registers.

*Introduce and use overlimit stat to keep track of number of dropped packets.

v2
*Use skb->priority field rather than DS field. Rename queueing discipline as
SKB Priority Queue (previously Gatekeeper Priority Queue).

*Queueing discipline is made classful to expose Skbprio's internal priority
queues.

Signed-off-by: Nishanth Devarajan <ndev2021@gmail.com>
Reviewed-by: Sachin Paryani <sachin.paryani@gmail.com>
Reviewed-by: Cody Doucette <doucette@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  15 ++
 net/sched/Kconfig              |  13 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_skbprio.c        | 320 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 349 insertions(+)
 create mode 100644 net/sched/sch_skbprio.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index d9cc9dc4f547..8975fd1a1421 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -124,6 +124,21 @@ struct tc_fifo_qopt {
 	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
 };
 
+/* SKBPRIO section */
+
+/*
+ * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
+ * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
+ * to map one to one the DS field of IPV4 and IPV6 headers.
+ * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
+ */
+
+#define SKBPRIO_MAX_PRIORITY 64
+
+struct tc_skbprio_qopt {
+	__u32	limit;		/* Queue length in packets. */
+};
+
 /* PRIO section */
 
 #define TCQ_PRIO_BANDS	16
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index bba71225adbd..e95741388311 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -251,6 +251,19 @@ config NET_SCH_MQPRIO
 
 	  If unsure, say N.
 
+config NET_SCH_SKBPRIO
+	tristate "SKB priority queue scheduler (SKBPRIO)"
+	help
+	  Say Y here if you want to use the SKB priority queue
+	  scheduler. This schedules packets according to skb->priority,
+	  which is useful for request packets in DoS mitigation systems such
+	  as Gatekeeper.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_skbprio.
+
+	  If unsure, say N.
+
 config NET_SCH_CHOKE
 	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 910ec7463a36..f0403f49edcb 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
+obj-$(CONFIG_NET_SCH_SKBPRIO)	+= sch_skbprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 000000000000..52c0b6d8f1d7
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,320 @@
+/*
+ * net/sched/sch_skbprio.c  SKB Priority Queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nishanth Devarajan, <ndev2021@gmail.com>
+ *		Cody Doucette, <doucette@bu.edu>
+ *	        original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+/*		SKB Priority Queue
+ *	=================================
+ *
+ * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes
+ * packets according to their skb->priority field. Under congestion,
+ * Skbprio drops already-enqueued lower priority packets to make space
+ * available for higher priority packets; it was conceived as a solution
+ * for denial-of-service defenses that need to route packets with different
+ * priorities as a means to overcome DoS attacks.
+ */
+
+struct skbprio_sched_data {
+	/* Queue state. */
+	struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
+	struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
+	u16 highest_prio;
+	u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* SKB queue is empty, return 0 (default highest priority setting). */
+	return 0;
+}
+
+static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
+{
+	int prio;
+
+	for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+		if (!skb_queue_empty(&q->qdiscs[prio]))
+			return prio;
+	}
+
+	/* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
+	 * (default lowest priority setting).
+	 */
+	return SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
+{
+	const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *qdisc;
+	struct sk_buff_head *lp_qdisc;
+	struct sk_buff *to_drop;
+	u16 prio, lp;
+
+	/* Obtain the priority of @skb. */
+	prio = min(skb->priority, max_priority);
+
+	qdisc = &q->qdiscs[prio];
+	if (sch->q.qlen < sch->limit) {
+		__skb_queue_tail(qdisc, skb);
+		qdisc_qstats_backlog_inc(sch, skb);
+		q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+		/* Check to update highest and lowest priorities. */
+		if (prio > q->highest_prio)
+			q->highest_prio = prio;
+
+		if (prio < q->lowest_prio)
+			q->lowest_prio = prio;
+
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* If this packet has the lowest priority, drop it. */
+	lp = q->lowest_prio;
+	if (prio <= lp) {
+		q->qstats[prio].drops++;
+		q->qstats[prio].overlimits++;
+		return qdisc_drop(skb, sch, to_free);
+	}
+
+	__skb_queue_tail(qdisc, skb);
+	qdisc_qstats_backlog_inc(sch, skb);
+	q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+	/* Drop the packet at the tail of the lowest priority qdisc. */
+	lp_qdisc = &q->qdiscs[lp];
+	to_drop = __skb_dequeue_tail(lp_qdisc);
+	BUG_ON(!to_drop);
+	qdisc_qstats_backlog_dec(sch, to_drop);
+	qdisc_drop(to_drop, sch, to_free);
+
+	q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
+	q->qstats[lp].drops++;
+	q->qstats[lp].overlimits++;
+
+	/* Check to update highest and lowest priorities. */
+	if (skb_queue_empty(lp_qdisc)) {
+		if (q->lowest_prio == q->highest_prio) {
+			/* The incoming packet is the only packet in queue. */
+			BUG_ON(sch->q.qlen != 1);
+			q->lowest_prio = prio;
+			q->highest_prio = prio;
+		} else {
+			q->lowest_prio = calc_new_low_prio(q);
+		}
+	}
+
+	if (prio > q->highest_prio)
+		q->highest_prio = prio;
+
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+	struct sk_buff *skb = __skb_dequeue(hpq);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	sch->q.qlen--;
+	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_bstats_update(sch, skb);
+
+	q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
+
+	/* Update highest priority field. */
+	if (skb_queue_empty(hpq)) {
+		if (q->lowest_prio == q->highest_prio) {
+			BUG_ON(sch->q.qlen);
+			q->highest_prio = 0;
+			q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+		} else {
+			q->highest_prio = calc_new_high_prio(q);
+		}
+	}
+	return skb;
+}
+
+static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct tc_skbprio_qopt *ctl = nla_data(opt);
+
+	sch->limit = ctl->limit;
+	return 0;
+}
+
+static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	/* Initialise all queues, one for each possible priority. */
+	for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_head_init(&q->qdiscs[prio]);
+
+	memset(&q->qstats, 0, sizeof(q->qstats));
+	q->highest_prio = 0;
+	q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+	sch->limit = 64;
+	if (!opt)
+		return 0;
+
+	return skbprio_change(sch, opt, extack);
+}
+
+static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct tc_skbprio_qopt opt;
+
+	opt.limit = sch->limit;
+
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		return -1;
+
+	return skb->len;
+}
+
+static void skbprio_reset(struct Qdisc *sch)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+
+	memset(&q->qstats, 0, sizeof(q->qstats));
+	q->highest_prio = 0;
+	q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static void skbprio_destroy(struct Qdisc *sch)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+		__skb_queue_purge(&q->qdiscs[prio]);
+}
+
+static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+{
+	struct skbprio_sched_data *q = qdisc_priv(sch);
+	if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
+		q->qstats[cl - 1].qlen) < 0)
+		return -1;
+	return 0;
+}
+
+static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
+		if (arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, i + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops skbprio_class_ops = {
+	.leaf		=	skbprio_leaf,
+	.find		=	skbprio_find,
+	.dump		=	skbprio_dump_class,
+	.dump_stats	=	skbprio_dump_class_stats,
+	.walk		=	skbprio_walk,
+};
+
+static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
+	.cl_ops		=	&skbprio_class_ops,
+	.id		=	"skbprio",
+	.priv_size	=	sizeof(struct skbprio_sched_data),
+	.enqueue	=	skbprio_enqueue,
+	.dequeue	=	skbprio_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	skbprio_init,
+	.reset		=	skbprio_reset,
+	.change		=	skbprio_change,
+	.dump		=	skbprio_dump,
+	.destroy	=	skbprio_destroy,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init skbprio_module_init(void)
+{
+	return register_qdisc(&skbprio_qdisc_ops);
+}
+
+static void __exit skbprio_module_exit(void)
+{
+	unregister_qdisc(&skbprio_qdisc_ops);
+}
+
+module_init(skbprio_module_init)
+module_exit(skbprio_module_exit)
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 6cbc304f2f360f25cc8607817239d6f4a2fd3dc5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 May 2018 15:48:41 +0200
Subject: perf/x86/intel: Fix unwind errors from PEBS entries (mk-II)

Vince reported the perf_fuzzer giving various unwinder warnings and
Josh reported:

> Deja vu.  Most of these are related to perf PEBS, similar to the
> following issue:
>
>   b8000586c90b ("perf/x86/intel: Cure bogus unwind from PEBS entries")
>
> This is basically the ORC version of that.  setup_pebs_sample_data() is
> assembling a franken-pt_regs which ORC isn't happy about.  RIP is
> inconsistent with some of the other registers (like RSP and RBP).

And where the previous unwinder only needed BP,SP ORC also requires
IP. But we cannot spoof IP because then the sample will get displaced,
entirely negating the point of PEBS.

So cure the whole thing differently by doing the unwind early; this
does however require a means to communicate we did the unwind early.
We (ab)use an unused sample_type bit for this, which we set on events
that fill out the data->callchain before the normal
perf_prepare_sample().

Debugged-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Tested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Tested-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c    |  3 +++
 arch/x86/events/intel/ds.c      | 25 +++++++++++--------------
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  2 ++
 kernel/events/core.c            |  6 ++++--
 5 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 707b2a96e516..86f0c15dcc2d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2997,6 +2997,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
+
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
 	}
 
 	if (needs_branch_stack(event)) {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8cf03f101938..8dbba77e0518 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1185,17 +1185,21 @@ static void setup_pebs_sample_data(struct perf_event *event,
 		data->data_src.val = val;
 	}
 
+	/*
+	 * We must however always use iregs for the unwinder to stay sane; the
+	 * record BP,SP,IP can point into thin air when the record is from a
+	 * previous PMI context or an (I)RET happened between the record and
+	 * PMI.
+	 */
+	if (sample_type & PERF_SAMPLE_CALLCHAIN)
+		data->callchain = perf_callchain(event, iregs);
+
 	/*
 	 * We use the interrupt regs as a base because the PEBS record does not
 	 * contain a full regs set, specifically it seems to lack segment
 	 * descriptors, which get used by things like user_mode().
 	 *
 	 * In the simple case fix up only the IP for PERF_SAMPLE_IP.
-	 *
-	 * We must however always use BP,SP from iregs for the unwinder to stay
-	 * sane; the record BP,SP can point into thin air when the record is
-	 * from a previous PMI context or an (I)RET happend between the record
-	 * and PMI.
 	 */
 	*regs = *iregs;
 
@@ -1214,15 +1218,8 @@ static void setup_pebs_sample_data(struct perf_event *event,
 		regs->si = pebs->si;
 		regs->di = pebs->di;
 
-		/*
-		 * Per the above; only set BP,SP if we don't need callchains.
-		 *
-		 * XXX: does this make sense?
-		 */
-		if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
-			regs->bp = pebs->bp;
-			regs->sp = pebs->sp;
-		}
+		regs->bp = pebs->bp;
+		regs->sp = pebs->sp;
 
 #ifndef CONFIG_X86_32
 		regs->r8 = pebs->r8;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1fa12887ec02..87f6db437e4a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1130,6 +1130,7 @@ extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 		   u32 max_stack, bool crosstask, bool add_mark);
+extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b8e288a1f740..eeb787b1c53c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,6 +143,8 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
 	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+
+	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63,
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8f0434a9951a..cdb32cf8e33c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6343,7 +6343,7 @@ static u64 perf_virt_to_phys(u64 virt)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
-static struct perf_callchain_entry *
+struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	bool kernel = !event->attr.exclude_callchain_kernel;
@@ -6382,7 +6382,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int size = 1;
 
-		data->callchain = perf_callchain(event, regs);
+		if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+			data->callchain = perf_callchain(event, regs);
+
 		size += data->callchain->nr;
 
 		header->size += size * sizeof(u64);
-- 
cgit v1.2.3


From f48a534adbd321362c3199140008c9e2f2a81b3a Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Thu, 28 Jun 2018 07:43:26 -0400
Subject: media: uapi/linux/cec.h: add 5V events

Add two new events to signal when the 5V line goes high or low.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/cec.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h
index 20fe091b7e96..097fcd812471 100644
--- a/include/uapi/linux/cec.h
+++ b/include/uapi/linux/cec.h
@@ -384,6 +384,8 @@ struct cec_log_addrs {
 #define CEC_EVENT_PIN_CEC_HIGH		4
 #define CEC_EVENT_PIN_HPD_LOW		5
 #define CEC_EVENT_PIN_HPD_HIGH		6
+#define CEC_EVENT_PIN_5V_LOW		7
+#define CEC_EVENT_PIN_5V_HIGH		8
 
 #define CEC_EVENT_FL_INITIAL_STATE	(1 << 0)
 #define CEC_EVENT_FL_DROPPED_EVENTS	(1 << 1)
-- 
cgit v1.2.3


From 30b914c8d825da7d4c651ade34667cef05e3ee27 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hansverk@cisco.com>
Date: Tue, 27 Feb 2018 07:24:09 -0500
Subject: media: add 'index' to struct media_v2_pad

The v2 pad structure never exposed the pad index, which made it impossible
to call the MEDIA_IOC_SETUP_LINK ioctl, which needs that information.

It is really trivial to just expose this information, so implement this.

Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/media-device.c |  1 +
 include/uapi/linux/media.h   | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index 47bb2254fbfd..047d38372a27 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -331,6 +331,7 @@ static long media_device_get_topology(struct media_device *mdev, void *arg)
 		kpad.id = pad->graph_obj.id;
 		kpad.entity_id = pad->entity->graph_obj.id;
 		kpad.flags = pad->flags;
+		kpad.index = pad->index;
 
 		if (copy_to_user(upad, &kpad, sizeof(kpad)))
 			ret = -EFAULT;
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 86c7dcc9cba3..f6338bd57929 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -305,11 +305,21 @@ struct media_v2_interface {
 	};
 } __attribute__ ((packed));
 
+/*
+ * Appeared in 4.19.0.
+ *
+ * The media_version argument comes from the media_version field in
+ * struct media_device_info.
+ */
+#define MEDIA_V2_PAD_HAS_INDEX(media_version) \
+	((media_version) >= ((4 << 16) | (19 << 8) | 0))
+
 struct media_v2_pad {
 	__u32 id;
 	__u32 entity_id;
 	__u32 flags;
-	__u32 reserved[5];
+	__u32 index;
+	__u32 reserved[4];
 } __attribute__ ((packed));
 
 struct media_v2_link {
-- 
cgit v1.2.3


From 588f4ee7e6fc5c9a0fb07c7051cdd341949e0feb Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hansverk@cisco.com>
Date: Wed, 28 Feb 2018 05:41:11 -0500
Subject: media: add flags field to struct media_v2_entity

The v2 entity structure never exposed the entity flags, which made it
impossible to detect connector or default entities.

It is really trivial to just expose this information, so implement this.

Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/media-device.c |  1 +
 include/uapi/linux/media.h   | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index 047d38372a27..14959b19a342 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -266,6 +266,7 @@ static long media_device_get_topology(struct media_device *mdev, void *arg)
 		memset(&kentity, 0, sizeof(kentity));
 		kentity.id = entity->graph_obj.id;
 		kentity.function = entity->function;
+		kentity.flags = entity->flags;
 		strlcpy(kentity.name, entity->name,
 			sizeof(kentity.name));
 
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index f6338bd57929..ebd2cda67833 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -280,11 +280,21 @@ struct media_links_enum {
  * MC next gen API definitions
  */
 
+/*
+ * Appeared in 4.19.0.
+ *
+ * The media_version argument comes from the media_version field in
+ * struct media_device_info.
+ */
+#define MEDIA_V2_ENTITY_HAS_FLAGS(media_version) \
+	((media_version) >= ((4 << 16) | (19 << 8) | 0))
+
 struct media_v2_entity {
 	__u32 id;
 	char name[64];
 	__u32 function;		/* Main function of the entity */
-	__u32 reserved[6];
+	__u32 flags;
+	__u32 reserved[5];
 } __attribute__ ((packed));
 
 /* Should match the specific fields at media_intf_devnode */
-- 
cgit v1.2.3


From d272bc92c4a4fcec4102c011eaa85630bd2d8d38 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hansverk@cisco.com>
Date: Thu, 28 Jun 2018 08:56:02 -0400
Subject: media: rename MEDIA_ENT_F_DTV_DECODER to MEDIA_ENT_F_DV_DECODER

The use of 'DTV' is very confusing since it normally refers to Digital
TV e.g. DVB etc.

Instead use 'DV' (Digital Video), which nicely corresponds to the
DV Timings API used to configure such receivers and transmitters.

We keep an alias to avoid breaking userspace applications.

Since this alias is only available if __KERNEL__ is *not* defined
(i.e. it is only available for userspace, not kernelspace), any
drivers that use it also have to be converted to the new define.
These drivers are adv7604, adv7842 and tda1997x.

Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/mediactl/media-types.rst | 2 +-
 drivers/media/i2c/adv7604.c                       | 1 +
 drivers/media/i2c/adv7842.c                       | 1 +
 drivers/media/i2c/tda1997x.c                      | 2 +-
 include/uapi/linux/media.h                        | 4 +++-
 5 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/mediactl/media-types.rst b/Documentation/media/uapi/mediactl/media-types.rst
index 96910cf2eaaa..c11b0c7e890b 100644
--- a/Documentation/media/uapi/mediactl/media-types.rst
+++ b/Documentation/media/uapi/mediactl/media-types.rst
@@ -200,7 +200,7 @@ Types and flags used to represent the media graph elements
          MIPI CSI-2, etc.), and outputs them on its source pad to an output
          video bus of another type (eDP, MIPI CSI-2, parallel, etc.).
 
-    *  -  ``MEDIA_ENT_F_DTV_DECODER``
+    *  -  ``MEDIA_ENT_F_DV_DECODER``
        -  Digital video decoder. The basic function of the video decoder is
 	  to accept digital video from a wide variety of sources
 	  and output it in some digital video standard, with appropriate
diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c
index 1a3b2c04d9f9..668be2bca57a 100644
--- a/drivers/media/i2c/adv7604.c
+++ b/drivers/media/i2c/adv7604.c
@@ -3499,6 +3499,7 @@ static int adv76xx_probe(struct i2c_client *client,
 	for (i = 0; i < state->source_pad; ++i)
 		state->pads[i].flags = MEDIA_PAD_FL_SINK;
 	state->pads[state->source_pad].flags = MEDIA_PAD_FL_SOURCE;
+	sd->entity.function = MEDIA_ENT_F_DV_DECODER;
 
 	err = media_entity_pads_init(&sd->entity, state->source_pad + 1,
 				state->pads);
diff --git a/drivers/media/i2c/adv7842.c b/drivers/media/i2c/adv7842.c
index fddac32e5051..99d781343fb1 100644
--- a/drivers/media/i2c/adv7842.c
+++ b/drivers/media/i2c/adv7842.c
@@ -3541,6 +3541,7 @@ static int adv7842_probe(struct i2c_client *client,
 	INIT_DELAYED_WORK(&state->delayed_work_enable_hotplug,
 			adv7842_delayed_work_enable_hotplug);
 
+	sd->entity.function = MEDIA_ENT_F_DV_DECODER;
 	state->pad.flags = MEDIA_PAD_FL_SOURCE;
 	err = media_entity_pads_init(&sd->entity, 1, &state->pad);
 	if (err)
diff --git a/drivers/media/i2c/tda1997x.c b/drivers/media/i2c/tda1997x.c
index 039a92c3294a..d114ac5243ec 100644
--- a/drivers/media/i2c/tda1997x.c
+++ b/drivers/media/i2c/tda1997x.c
@@ -2570,7 +2570,7 @@ static int tda1997x_probe(struct i2c_client *client,
 		 id->name, i2c_adapter_id(client->adapter),
 		 client->addr);
 	sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE | V4L2_SUBDEV_FL_HAS_EVENTS;
-	sd->entity.function = MEDIA_ENT_F_DTV_DECODER;
+	sd->entity.function = MEDIA_ENT_F_DV_DECODER;
 	sd->entity.ops = &tda1997x_media_ops;
 
 	/* set allowed mbus modes based on chip, bus-type, and bus-width */
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index ebd2cda67833..99f5e0978ebb 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -93,7 +93,7 @@ struct media_device_info {
  * Video decoder functions
  */
 #define MEDIA_ENT_F_ATV_DECODER			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 4)
-#define MEDIA_ENT_F_DTV_DECODER			(MEDIA_ENT_F_BASE + 0x6001)
+#define MEDIA_ENT_F_DV_DECODER			(MEDIA_ENT_F_BASE + 0x6001)
 
 /*
  * Digital TV, analog TV, radio and/or software defined radio tuner functions.
@@ -400,6 +400,8 @@ struct media_v2_topology {
 #define MEDIA_ENT_T_V4L2_SUBDEV_DECODER		MEDIA_ENT_F_ATV_DECODER
 #define MEDIA_ENT_T_V4L2_SUBDEV_TUNER		MEDIA_ENT_F_TUNER
 
+#define MEDIA_ENT_F_DTV_DECODER			MEDIA_ENT_F_DV_DECODER
+
 /*
  * There is still no ALSA support in the media controller. These
  * defines should not have been added and we leave them here only
-- 
cgit v1.2.3


From 7c8362c4b9eb7e4e9cbebc71ab529dc455c21b6b Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Mon, 18 Jun 2018 05:07:43 -0400
Subject: media: media.h: add MEDIA_ENT_F_DV_ENCODER

Add a new function for digital video encoders such as HDMI transmitters.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/mediactl/media-types.rst | 7 +++++++
 include/uapi/linux/media.h                        | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/mediactl/media-types.rst b/Documentation/media/uapi/mediactl/media-types.rst
index c11b0c7e890b..e90d4d0a7f8b 100644
--- a/Documentation/media/uapi/mediactl/media-types.rst
+++ b/Documentation/media/uapi/mediactl/media-types.rst
@@ -206,6 +206,13 @@ Types and flags used to represent the media graph elements
 	  and output it in some digital video standard, with appropriate
 	  timing signals.
 
+    *  -  ``MEDIA_ENT_F_DV_ENCODER``
+       -  Digital video encoder. The basic function of the video encoder is
+	  to accept digital video from some digital video standard with
+	  appropriate timing signals (usually a parallel video bus with sync
+	  signals) and output this to a digital video output connector such
+	  as HDMI or DisplayPort.
+
 ..  tabularcolumns:: |p{5.5cm}|p{12.0cm}|
 
 .. _media-entity-flag:
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 99f5e0978ebb..6f594fa238c2 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -90,10 +90,11 @@ struct media_device_info {
 #define MEDIA_ENT_F_LENS			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 3)
 
 /*
- * Video decoder functions
+ * Video decoder/encoder functions
  */
 #define MEDIA_ENT_F_ATV_DECODER			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 4)
 #define MEDIA_ENT_F_DV_DECODER			(MEDIA_ENT_F_BASE + 0x6001)
+#define MEDIA_ENT_F_DV_ENCODER			(MEDIA_ENT_F_BASE + 0x6002)
 
 /*
  * Digital TV, analog TV, radio and/or software defined radio tuner functions.
-- 
cgit v1.2.3


From f2399f7522ac2a255f6502537609a51974be84be Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hansverk@cisco.com>
Date: Thu, 28 Jun 2018 09:03:42 -0400
Subject: media: media.h: reorder video en/decoder functions

Keep the function defines in numerical order: 0x6000 comes after
0x2000, so move it back.

Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/uapi/linux/media.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 6f594fa238c2..76d9bd64c116 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -89,13 +89,6 @@ struct media_device_info {
 #define MEDIA_ENT_F_FLASH			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 2)
 #define MEDIA_ENT_F_LENS			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 3)
 
-/*
- * Video decoder/encoder functions
- */
-#define MEDIA_ENT_F_ATV_DECODER			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 4)
-#define MEDIA_ENT_F_DV_DECODER			(MEDIA_ENT_F_BASE + 0x6001)
-#define MEDIA_ENT_F_DV_ENCODER			(MEDIA_ENT_F_BASE + 0x6002)
-
 /*
  * Digital TV, analog TV, radio and/or software defined radio tuner functions.
  *
@@ -140,6 +133,13 @@ struct media_device_info {
 #define MEDIA_ENT_F_VID_MUX			(MEDIA_ENT_F_BASE + 0x5001)
 #define MEDIA_ENT_F_VID_IF_BRIDGE		(MEDIA_ENT_F_BASE + 0x5002)
 
+/*
+ * Video decoder/encoder functions
+ */
+#define MEDIA_ENT_F_ATV_DECODER			(MEDIA_ENT_F_OLD_SUBDEV_BASE + 4)
+#define MEDIA_ENT_F_DV_DECODER			(MEDIA_ENT_F_BASE + 0x6001)
+#define MEDIA_ENT_F_DV_ENCODER			(MEDIA_ENT_F_BASE + 0x6002)
+
 /* Entity flags */
 #define MEDIA_ENT_FL_DEFAULT			(1 << 0)
 #define MEDIA_ENT_FL_CONNECTOR			(1 << 1)
-- 
cgit v1.2.3


From 45841a977391f24b9bf713548c588d148a576d22 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Wed, 4 Jul 2018 10:13:47 -0400
Subject: media: media.h: add encoder/decoder functions for codecs

Add MEDIA_ENT_F_PROC_VIDEO_EN/DECODER to be used for the encoder
and decoder entities of codec hardware.

[mchehab+samsung@kernel.org: split description into two sentences by adding dots]
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/mediactl/media-types.rst | 11 +++++++++++
 include/uapi/linux/media.h                        |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/mediactl/media-types.rst b/Documentation/media/uapi/mediactl/media-types.rst
index e90d4d0a7f8b..7b17acc049cf 100644
--- a/Documentation/media/uapi/mediactl/media-types.rst
+++ b/Documentation/media/uapi/mediactl/media-types.rst
@@ -37,6 +37,8 @@ Types and flags used to represent the media graph elements
 .. _MEDIA-ENT-F-PROC-VIDEO-LUT:
 .. _MEDIA-ENT-F-PROC-VIDEO-SCALER:
 .. _MEDIA-ENT-F-PROC-VIDEO-STATISTICS:
+.. _MEDIA-ENT-F-PROC-VIDEO-ENCODER:
+.. _MEDIA-ENT-F-PROC-VIDEO-DECODER:
 .. _MEDIA-ENT-F-VID-MUX:
 .. _MEDIA-ENT-F-VID-IF-BRIDGE:
 .. _MEDIA-ENT-F-DTV-DECODER:
@@ -188,6 +190,15 @@ Types and flags used to represent the media graph elements
 	  received on its sink pad and outputs the statistics data on
 	  its source pad.
 
+    *  -  ``MEDIA_ENT_F_PROC_VIDEO_ENCODER``
+       -  Video (MPEG, HEVC, VPx, etc.) encoder. An entity capable of
+          compressing video frames. Must have one sink pad and one source pad.
+
+    *  -  ``MEDIA_ENT_F_PROC_VIDEO_DECODER``
+       -  Video (MPEG, HEVC, VPx, etc.) decoder. An entity capable of
+          decompressing a compressed video stream into uncompressed video
+	  frames. Must have one sink pad and one source pad.
+
     *  -  ``MEDIA_ENT_F_VID_MUX``
        - Video multiplexer. An entity capable of multiplexing must have at
          least two sink pads and one source pad, and must pass the video
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 76d9bd64c116..82ec9f132a53 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -126,6 +126,8 @@ struct media_device_info {
 #define MEDIA_ENT_F_PROC_VIDEO_LUT		(MEDIA_ENT_F_BASE + 0x4004)
 #define MEDIA_ENT_F_PROC_VIDEO_SCALER		(MEDIA_ENT_F_BASE + 0x4005)
 #define MEDIA_ENT_F_PROC_VIDEO_STATISTICS	(MEDIA_ENT_F_BASE + 0x4006)
+#define MEDIA_ENT_F_PROC_VIDEO_ENCODER		(MEDIA_ENT_F_BASE + 0x4007)
+#define MEDIA_ENT_F_PROC_VIDEO_DECODER		(MEDIA_ENT_F_BASE + 0x4008)
 
 /*
  * Switch and bridge entity functions
-- 
cgit v1.2.3


From 62c3fce04154777e6a3ce3a27f123b645d36dcff Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Wed, 4 Jul 2018 10:13:47 -0400
Subject: media: videodev.h: add PIX_FMT_FWHT for use with vicodec

Add a new pixelformat for the vicodec software codec using the
Fast Walsh Hadamard Transform.

Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/pixfmt-compressed.rst | 7 +++++++
 drivers/media/v4l2-core/v4l2-ioctl.c               | 1 +
 include/uapi/linux/videodev2.h                     | 1 +
 3 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/pixfmt-compressed.rst b/Documentation/media/uapi/v4l/pixfmt-compressed.rst
index abec03937bb3..d382e7a5c38e 100644
--- a/Documentation/media/uapi/v4l/pixfmt-compressed.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-compressed.rst
@@ -95,3 +95,10 @@ Compressed Formats
       - ``V4L2_PIX_FMT_HEVC``
       - 'HEVC'
       - HEVC/H.265 video elementary stream.
+    * .. _V4L2-PIX-FMT-FWHT:
+
+      - ``V4L2_PIX_FMT_FWHT``
+      - 'FWHT'
+      - Video elementary stream using a codec based on the Fast Walsh Hadamard
+        Transform. This codec is implemented by the vicodec ('Virtual Codec')
+	driver. See the vicodec-codec.h header for more details.
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 01670567641a..26d9702069fd 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1311,6 +1311,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 		case V4L2_PIX_FMT_VP8:		descr = "VP8"; break;
 		case V4L2_PIX_FMT_VP9:		descr = "VP9"; break;
 		case V4L2_PIX_FMT_HEVC:		descr = "HEVC"; break; /* aka H.265 */
+		case V4L2_PIX_FMT_FWHT:		descr = "FWHT"; break; /* used in vicodec */
 		case V4L2_PIX_FMT_CPIA1:	descr = "GSPCA CPiA YUV"; break;
 		case V4L2_PIX_FMT_WNVA:		descr = "WNVA"; break;
 		case V4L2_PIX_FMT_SN9C10X:	descr = "GSPCA SN9C10X"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 600877be5c22..3ea8097c2470 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -636,6 +636,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_VP8      v4l2_fourcc('V', 'P', '8', '0') /* VP8 */
 #define V4L2_PIX_FMT_VP9      v4l2_fourcc('V', 'P', '9', '0') /* VP9 */
 #define V4L2_PIX_FMT_HEVC     v4l2_fourcc('H', 'E', 'V', 'C') /* HEVC aka H.265 */
+#define V4L2_PIX_FMT_FWHT     v4l2_fourcc('F', 'W', 'H', 'T') /* Fast Walsh Hadamard Transform (vicodec) */
 
 /*  Vendor-specific formats   */
 #define V4L2_PIX_FMT_CPIA1    v4l2_fourcc('C', 'P', 'I', 'A') /* cpia1 YUV */
-- 
cgit v1.2.3


From 603cc1498455cf57f5ca4483b600efb37ea2c56c Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 25 Jul 2018 16:35:32 +0200
Subject: net/smc: provide fallback reason code

Remember the fallback reason code and the peer diagnosis code for
smc sockets, and provide them in smc_diag.c to the netlink interface.
And add more detailed reason codes.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/smc_diag.h |  6 +++++
 net/smc/af_smc.c              | 52 +++++++++++++++++++++++++------------------
 net/smc/smc.h                 |  2 ++
 net/smc/smc_clc.c             |  6 ++++-
 net/smc/smc_clc.h             | 18 ++++++++++-----
 net/smc/smc_diag.c            |  6 +++++
 6 files changed, 61 insertions(+), 29 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h
index 48ae3ee22b2d..ac9e8c96d9bd 100644
--- a/include/uapi/linux/smc_diag.h
+++ b/include/uapi/linux/smc_diag.h
@@ -43,6 +43,7 @@ enum {
 	SMC_DIAG_LGRINFO,
 	SMC_DIAG_SHUTDOWN,
 	SMC_DIAG_DMBINFO,
+	SMC_DIAG_FALLBACK,
 	__SMC_DIAG_MAX,
 };
 
@@ -92,6 +93,11 @@ struct smc_diag_lgrinfo {
 	__u8				role;
 };
 
+struct smc_diag_fallback {
+	__u32 reason;
+	__u32 peer_diagnosis;
+};
+
 struct smcd_diag_dmbinfo {		/* SMC-D Socket internals */
 	__u32 linkid;			/* Link identifier */
 	__u64 peer_gid;			/* Peer GID */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b81797103260..fce7e4751151 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -344,17 +344,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 
 	rc = smc_ib_modify_qp_rts(link);
 	if (rc)
-		return SMC_CLC_DECL_INTERR;
+		return SMC_CLC_DECL_ERR_RDYLNK;
 
 	smc_wr_remember_qp_attr(link);
 
 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
-		return SMC_CLC_DECL_INTERR;
+		return SMC_CLC_DECL_ERR_REGRMB;
 
 	/* send CONFIRM LINK response over RoCE fabric */
 	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
 	if (rc < 0)
-		return SMC_CLC_DECL_TCL;
+		return SMC_CLC_DECL_TIMEOUT_CL;
 
 	/* receive ADD LINK request from server over RoCE fabric */
 	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
@@ -372,7 +372,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 				   link->smcibdev->mac[link->ibport - 1],
 				   link->gid, SMC_LLC_RESP);
 	if (rc < 0)
-		return SMC_CLC_DECL_TCL;
+		return SMC_CLC_DECL_TIMEOUT_AL;
 
 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
 
@@ -424,9 +424,10 @@ static void smc_link_save_peer_info(struct smc_link *link,
 }
 
 /* fall back during connect */
-static int smc_connect_fallback(struct smc_sock *smc)
+static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
 {
 	smc->use_fallback = true;
+	smc->fallback_rsn = reason_code;
 	smc_copy_sock_settings_to_clc(smc);
 	if (smc->sk.sk_state == SMC_INIT)
 		smc->sk.sk_state = SMC_ACTIVE;
@@ -443,7 +444,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
 			sock_put(&smc->sk); /* passive closing */
 		return reason_code;
 	}
-	if (reason_code != SMC_CLC_DECL_REPLY) {
+	if (reason_code != SMC_CLC_DECL_PEERDECL) {
 		rc = smc_clc_send_decline(smc, reason_code);
 		if (rc < 0) {
 			if (smc->sk.sk_state == SMC_INIT)
@@ -451,7 +452,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
 			return rc;
 		}
 	}
-	return smc_connect_fallback(smc);
+	return smc_connect_fallback(smc, reason_code);
 }
 
 /* abort connecting */
@@ -568,7 +569,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
 		smc_link_save_peer_info(link, aclc);
 
 	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
-		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
 					 local_contact);
 
 	smc_close_init(smc);
@@ -576,12 +577,12 @@ static int smc_connect_rdma(struct smc_sock *smc,
 
 	if (local_contact == SMC_FIRST_CONTACT) {
 		if (smc_ib_ready_link(link))
-			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
 						 local_contact);
 	} else {
 		if (!smc->conn.rmb_desc->reused &&
 		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
-			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
 						 local_contact);
 	}
 	smc_rmb_sync_sg_for_device(&smc->conn);
@@ -659,11 +660,11 @@ static int __smc_connect(struct smc_sock *smc)
 	sock_hold(&smc->sk); /* sock put in passive closing */
 
 	if (smc->use_fallback)
-		return smc_connect_fallback(smc);
+		return smc_connect_fallback(smc, smc->fallback_rsn);
 
 	/* if peer has not signalled SMC-capability, fall back */
 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
-		return smc_connect_fallback(smc);
+		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
 
 	/* IPSec connections opt out of SMC-R optimizations */
 	if (using_ipsec(smc))
@@ -693,7 +694,7 @@ static int __smc_connect(struct smc_sock *smc)
 
 	/* if neither ISM nor RDMA are supported, fallback */
 	if (!rdma_supported && !ism_supported)
-		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
 
 	/* perform CLC handshake */
 	rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
@@ -708,7 +709,7 @@ static int __smc_connect(struct smc_sock *smc)
 	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
 		rc = smc_connect_ism(smc, &aclc, ismdev);
 	else
-		rc = SMC_CLC_DECL_CNFERR;
+		rc = SMC_CLC_DECL_MODEUNSUPP;
 	if (rc) {
 		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
 		return smc_connect_decline_fallback(smc, rc);
@@ -946,12 +947,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 	link = &lgr->lnk[SMC_SINGLE_LINK];
 
 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
-		return SMC_CLC_DECL_INTERR;
+		return SMC_CLC_DECL_ERR_REGRMB;
 
 	/* send CONFIRM LINK request to client over the RoCE fabric */
 	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
 	if (rc < 0)
-		return SMC_CLC_DECL_TCL;
+		return SMC_CLC_DECL_TIMEOUT_CL;
 
 	/* receive CONFIRM LINK response from client over the RoCE fabric */
 	rest = wait_for_completion_interruptible_timeout(
@@ -973,7 +974,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 				   link->smcibdev->mac[link->ibport - 1],
 				   link->gid, SMC_LLC_REQ);
 	if (rc < 0)
-		return SMC_CLC_DECL_TCL;
+		return SMC_CLC_DECL_TIMEOUT_AL;
 
 	/* receive ADD LINK response from client over the RoCE fabric */
 	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
@@ -1048,7 +1049,8 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
 	}
 	smc_conn_free(&new_smc->conn);
 	new_smc->use_fallback = true;
-	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
+	new_smc->fallback_rsn = reason_code;
+	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
 		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
 			smc_listen_out_err(new_smc);
 			return;
@@ -1139,7 +1141,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
 	if (local_contact != SMC_FIRST_CONTACT) {
 		if (!new_smc->conn.rmb_desc->reused) {
 			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
-				return SMC_CLC_DECL_INTERR;
+				return SMC_CLC_DECL_ERR_REGRMB;
 		}
 	}
 	smc_rmb_sync_sg_for_device(&new_smc->conn);
@@ -1159,13 +1161,13 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc,
 		smc_link_save_peer_info(link, cclc);
 
 	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
-		reason_code = SMC_CLC_DECL_INTERR;
+		reason_code = SMC_CLC_DECL_ERR_RTOK;
 		goto decline;
 	}
 
 	if (local_contact == SMC_FIRST_CONTACT) {
 		if (smc_ib_ready_link(link)) {
-			reason_code = SMC_CLC_DECL_INTERR;
+			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
 			goto decline;
 		}
 		/* QP confirmation over RoCE fabric */
@@ -1206,6 +1208,7 @@ static void smc_listen_work(struct work_struct *work)
 	/* check if peer is smc capable */
 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
 		new_smc->use_fallback = true;
+		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
 		smc_listen_out_connected(new_smc);
 		return;
 	}
@@ -1250,7 +1253,8 @@ static void smc_listen_work(struct work_struct *work)
 	     smc_listen_rdma_reg(new_smc, local_contact))) {
 		/* SMC not supported, decline */
 		mutex_unlock(&smc_create_lgr_pending);
-		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
+		smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
+				   local_contact);
 		return;
 	}
 
@@ -1297,6 +1301,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
 
 		new_smc->listen_smc = lsmc;
 		new_smc->use_fallback = lsmc->use_fallback;
+		new_smc->fallback_rsn = lsmc->fallback_rsn;
 		sock_hold(lsk); /* sock_put in smc_listen_work */
 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 		smc_copy_sock_settings_to_smc(new_smc);
@@ -1451,6 +1456,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	if (msg->msg_flags & MSG_FASTOPEN) {
 		if (sk->sk_state == SMC_INIT) {
 			smc->use_fallback = true;
+			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
 		} else {
 			rc = -EINVAL;
 			goto out;
@@ -1648,6 +1654,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 		/* option not supported by SMC */
 		if (sk->sk_state == SMC_INIT) {
 			smc->use_fallback = true;
+			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
 		} else {
 			if (!smc->use_fallback)
 				rc = -EINVAL;
@@ -1885,6 +1892,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
 	/* create internal TCP socket for CLC handshake and fallback */
 	smc = smc_sk(sk);
 	smc->use_fallback = false; /* assume rdma capability first */
+	smc->fallback_rsn = 0;
 	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
 			      &smc->clcsock);
 	if (rc) {
diff --git a/net/smc/smc.h b/net/smc/smc.h
index be20acd7b5ab..08786ace6010 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -208,6 +208,8 @@ struct smc_sock {				/* smc sock container */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
 	bool			use_fallback;	/* fallback to tcp */
+	int			fallback_rsn;	/* reason for fallback */
+	u32			peer_diagnosis; /* decline reason from peer */
 	int			sockopt_defer_accept;
 						/* sockopt TCP_DEFER_ACCEPT
 						 * value
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 78d74938a9d9..83aba9ade060 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -334,7 +334,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		goto out;
 	}
 	if (clcm->type == SMC_CLC_DECLINE) {
-		reason_code = SMC_CLC_DECL_REPLY;
+		struct smc_clc_msg_decline *dclc;
+
+		dclc = (struct smc_clc_msg_decline *)clcm;
+		reason_code = SMC_CLC_DECL_PEERDECL;
+		smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
 		if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
 			smc->conn.lgr->sync_err = 1;
 			smc_lgr_terminate(smc->conn.lgr);
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 6bdc63352d6a..18da89b681c2 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -28,15 +28,21 @@
 #define SMC_TYPE_B		3		/* SMC-R and SMC-D	      */
 #define CLC_WAIT_TIME		(6 * HZ)	/* max. wait time on clcsock  */
 #define SMC_CLC_DECL_MEM	0x01010000  /* insufficient memory resources  */
-#define SMC_CLC_DECL_TIMEOUT	0x02000000  /* timeout                        */
+#define SMC_CLC_DECL_TIMEOUT_CL	0x02010000  /* timeout w4 QP confirm link     */
+#define SMC_CLC_DECL_TIMEOUT_AL	0x02020000  /* timeout w4 QP add link	      */
 #define SMC_CLC_DECL_CNFERR	0x03000000  /* configuration error            */
-#define SMC_CLC_DECL_IPSEC	0x03030000  /* IPsec usage                    */
+#define SMC_CLC_DECL_PEERNOSMC	0x03010000  /* peer did not indicate SMC      */
+#define SMC_CLC_DECL_IPSEC	0x03020000  /* IPsec usage		      */
+#define SMC_CLC_DECL_NOSMCDEV	0x03030000  /* no SMC device found	      */
+#define SMC_CLC_DECL_MODEUNSUPP	0x03040000  /* smc modes do not match (R or D)*/
+#define SMC_CLC_DECL_RMBE_EC	0x03050000  /* peer has eyecatcher in RMBE    */
+#define SMC_CLC_DECL_OPTUNSUPP	0x03060000  /* fastopen sockopt not supported */
 #define SMC_CLC_DECL_SYNCERR	0x04000000  /* synchronization error          */
-#define SMC_CLC_DECL_REPLY	0x06000000  /* reply to a received decline    */
+#define SMC_CLC_DECL_PEERDECL	0x05000000  /* peer declined during handshake */
 #define SMC_CLC_DECL_INTERR	0x99990000  /* internal error                 */
-#define SMC_CLC_DECL_TCL	0x02040000  /* timeout w4 QP confirm          */
-#define SMC_CLC_DECL_SEND	0x07000000  /* sending problem                */
-#define SMC_CLC_DECL_RMBE_EC	0x08000000  /* peer has eyecatcher in RMBE    */
+#define SMC_CLC_DECL_ERR_RTOK	0x99990001  /*	 rtoken handling failed       */
+#define SMC_CLC_DECL_ERR_RDYLNK	0x99990002  /*	 ib ready link failed	      */
+#define SMC_CLC_DECL_ERR_REGRMB	0x99990003  /*	 reg rmb failed		      */
 
 struct smc_clc_msg_hdr {	/* header1 of clc messages */
 	u8 eyecatcher[4];	/* eye catcher */
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index a3cf7313a2d3..dbf64a93d68a 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -79,6 +79,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 			   struct nlattr *bc)
 {
 	struct smc_sock *smc = smc_sk(sk);
+	struct smc_diag_fallback fallback;
 	struct user_namespace *user_ns;
 	struct smc_diag_msg *r;
 	struct nlmsghdr *nlh;
@@ -101,6 +102,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 	if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
 		goto errout;
 
+	fallback.reason = smc->fallback_rsn;
+	fallback.peer_diagnosis = smc->peer_diagnosis;
+	if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
+		goto errout;
+
 	if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
 	    smc->conn.alert_token_local) {
 		struct smc_connection *conn = &smc->conn;
-- 
cgit v1.2.3


From d396e47fb558a819226955ce5db0149fde88da0f Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Mon, 21 May 2018 11:28:51 +0300
Subject: usb: gadget: uvc: Move userspace API definition to public header

The UVC gadget userspace API (V4L2 events and custom ioctls) is defined
in a header internal to the kernel. Move it to a new public header to
make it accessible to userspace.

The UVC_INTF_CONTROL and UVC_INTF_STREAMING macros are not used, so
remove them in the process.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 MAINTAINERS                       |  1 +
 drivers/usb/gadget/function/uvc.h | 45 +++++----------------------------------
 include/uapi/linux/usb/g_uvc.h    | 39 +++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 40 deletions(-)
 create mode 100644 include/uapi/linux/usb/g_uvc.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 0fe4228f78cb..37035a0c7522 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14996,6 +14996,7 @@ L:	linux-usb@vger.kernel.org
 S:	Maintained
 F:	drivers/usb/gadget/function/*uvc*
 F:	drivers/usb/gadget/legacy/webcam.c
+F:	include/uapi/linux/usb/g_uvc.h
 
 USB WIRELESS RNDIS DRIVER (rndis_wlan)
 M:	Jussi Kivilinna <jussi.kivilinna@iki.fi>
diff --git a/drivers/usb/gadget/function/uvc.h b/drivers/usb/gadget/function/uvc.h
index a64e07e61f8c..053e4b72039d 100644
--- a/drivers/usb/gadget/function/uvc.h
+++ b/drivers/usb/gadget/function/uvc.h
@@ -9,52 +9,20 @@
 #ifndef _UVC_GADGET_H_
 #define _UVC_GADGET_H_
 
-#include <linux/ioctl.h>
-#include <linux/types.h>
-#include <linux/usb/ch9.h>
-
-#define UVC_EVENT_FIRST			(V4L2_EVENT_PRIVATE_START + 0)
-#define UVC_EVENT_CONNECT		(V4L2_EVENT_PRIVATE_START + 0)
-#define UVC_EVENT_DISCONNECT		(V4L2_EVENT_PRIVATE_START + 1)
-#define UVC_EVENT_STREAMON		(V4L2_EVENT_PRIVATE_START + 2)
-#define UVC_EVENT_STREAMOFF		(V4L2_EVENT_PRIVATE_START + 3)
-#define UVC_EVENT_SETUP			(V4L2_EVENT_PRIVATE_START + 4)
-#define UVC_EVENT_DATA			(V4L2_EVENT_PRIVATE_START + 5)
-#define UVC_EVENT_LAST			(V4L2_EVENT_PRIVATE_START + 5)
-
-struct uvc_request_data {
-	__s32 length;
-	__u8 data[60];
-};
-
-struct uvc_event {
-	union {
-		enum usb_device_speed speed;
-		struct usb_ctrlrequest req;
-		struct uvc_request_data data;
-	};
-};
-
-#define UVCIOC_SEND_RESPONSE		_IOW('U', 1, struct uvc_request_data)
-
-#define UVC_INTF_CONTROL		0
-#define UVC_INTF_STREAMING		1
-
-/* ------------------------------------------------------------------------
- * Debugging, printing and logging
- */
-
-#ifdef __KERNEL__
-
 #include <linux/usb.h>	/* For usb_endpoint_* */
 #include <linux/usb/composite.h>
 #include <linux/usb/gadget.h>
+#include <linux/usb/g_uvc.h>
 #include <linux/videodev2.h>
 #include <media/v4l2-fh.h>
 #include <media/v4l2-device.h>
 
 #include "uvc_queue.h"
 
+/* ------------------------------------------------------------------------
+ * Debugging, printing and logging
+ */
+
 #define UVC_TRACE_PROBE				(1 << 0)
 #define UVC_TRACE_DESCR				(1 << 1)
 #define UVC_TRACE_CONTROL			(1 << 2)
@@ -184,7 +152,4 @@ extern void uvc_endpoint_stream(struct uvc_device *dev);
 extern void uvc_function_connect(struct uvc_device *uvc);
 extern void uvc_function_disconnect(struct uvc_device *uvc);
 
-#endif /* __KERNEL__ */
-
 #endif /* _UVC_GADGET_H_ */
-
diff --git a/include/uapi/linux/usb/g_uvc.h b/include/uapi/linux/usb/g_uvc.h
new file mode 100644
index 000000000000..3c9ee3020cbb
--- /dev/null
+++ b/include/uapi/linux/usb/g_uvc.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * g_uvc.h  --  USB Video Class Gadget driver API
+ *
+ * Copyright (C) 2009-2010 Laurent Pinchart <laurent.pinchart@ideasonboard.com>
+ */
+
+#ifndef __LINUX_USB_G_UVC_H
+#define __LINUX_USB_G_UVC_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <linux/usb/ch9.h>
+
+#define UVC_EVENT_FIRST			(V4L2_EVENT_PRIVATE_START + 0)
+#define UVC_EVENT_CONNECT		(V4L2_EVENT_PRIVATE_START + 0)
+#define UVC_EVENT_DISCONNECT		(V4L2_EVENT_PRIVATE_START + 1)
+#define UVC_EVENT_STREAMON		(V4L2_EVENT_PRIVATE_START + 2)
+#define UVC_EVENT_STREAMOFF		(V4L2_EVENT_PRIVATE_START + 3)
+#define UVC_EVENT_SETUP			(V4L2_EVENT_PRIVATE_START + 4)
+#define UVC_EVENT_DATA			(V4L2_EVENT_PRIVATE_START + 5)
+#define UVC_EVENT_LAST			(V4L2_EVENT_PRIVATE_START + 5)
+
+struct uvc_request_data {
+	__s32 length;
+	__u8 data[60];
+};
+
+struct uvc_event {
+	union {
+		enum usb_device_speed speed;
+		struct usb_ctrlrequest req;
+		struct uvc_request_data data;
+	};
+};
+
+#define UVCIOC_SEND_RESPONSE		_IOW('U', 1, struct uvc_request_data)
+
+#endif /* __LINUX_USB_G_UVC_H */
-- 
cgit v1.2.3


From 179909ecafc3bae1f34289e88bacd45e391f0554 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 24 Jul 2018 11:38:14 -0700
Subject: Input: stop telling users to snail-mail Vojtech

I do not think Vojtech wants snail mail these days (and he mentioned that
nobody has ever sent him snail mail), and the address is not even valid
anymore, so let's remove snail-mail instructions from the sources.

Acked-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/evbug.c                          | 4 ----
 drivers/input/gameport/emu10k1-gp.c            | 4 ----
 drivers/input/gameport/lightning.c             | 4 ----
 drivers/input/gameport/ns558.c                 | 4 ----
 drivers/input/joystick/a3d.c                   | 4 ----
 drivers/input/joystick/adi.c                   | 4 ----
 drivers/input/joystick/amijoy.c                | 4 ----
 drivers/input/joystick/analog.c                | 4 ----
 drivers/input/joystick/cobra.c                 | 4 ----
 drivers/input/joystick/db9.c                   | 4 ----
 drivers/input/joystick/gamecon.c               | 4 ----
 drivers/input/joystick/gf2k.c                  | 4 ----
 drivers/input/joystick/grip.c                  | 4 ----
 drivers/input/joystick/guillemot.c             | 4 ----
 drivers/input/joystick/iforce/iforce-ff.c      | 4 ----
 drivers/input/joystick/iforce/iforce-main.c    | 4 ----
 drivers/input/joystick/iforce/iforce-packets.c | 4 ----
 drivers/input/joystick/iforce/iforce-serio.c   | 4 ----
 drivers/input/joystick/iforce/iforce-usb.c     | 4 ----
 drivers/input/joystick/iforce/iforce.h         | 4 ----
 drivers/input/joystick/interact.c              | 4 ----
 drivers/input/joystick/joydump.c               | 4 ----
 drivers/input/joystick/magellan.c              | 4 ----
 drivers/input/joystick/sidewinder.c            | 4 ----
 drivers/input/joystick/spaceball.c             | 4 ----
 drivers/input/joystick/spaceorb.c              | 4 ----
 drivers/input/joystick/stinger.c               | 4 ----
 drivers/input/joystick/tmdc.c                  | 4 ----
 drivers/input/joystick/turbografx.c            | 4 ----
 drivers/input/joystick/warrior.c               | 4 ----
 drivers/input/keyboard/amikbd.c                | 4 ----
 drivers/input/keyboard/atakbd.c                | 4 ----
 drivers/input/keyboard/newtonkbd.c             | 4 ----
 drivers/input/keyboard/stowaway.c              | 4 ----
 drivers/input/keyboard/sunkbd.c                | 4 ----
 drivers/input/keyboard/xtkbd.c                 | 4 ----
 drivers/input/mouse/inport.c                   | 4 ----
 drivers/input/mouse/logibm.c                   | 4 ----
 drivers/input/mouse/pc110pad.c                 | 4 ----
 drivers/input/mouse/sermouse.c                 | 4 ----
 drivers/input/serio/ct82c710.c                 | 4 ----
 drivers/input/serio/q40kbd.c                   | 4 ----
 drivers/input/serio/rpckbd.c                   | 4 ----
 drivers/input/serio/serio.c                    | 4 ----
 drivers/input/touchscreen/gunze.c              | 4 ----
 include/linux/joystick.h                       | 4 ----
 include/uapi/linux/joystick.h                  | 4 ----
 47 files changed, 188 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/input/evbug.c b/drivers/input/evbug.c
index cd4e6679d61a..5419c1c1f621 100644
--- a/drivers/input/evbug.c
+++ b/drivers/input/evbug.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/input/gameport/emu10k1-gp.c b/drivers/input/gameport/emu10k1-gp.c
index 2909e9561cf3..afdc20ca0e24 100644
--- a/drivers/input/gameport/emu10k1-gp.c
+++ b/drivers/input/gameport/emu10k1-gp.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <asm/io.h>
diff --git a/drivers/input/gameport/lightning.c b/drivers/input/gameport/lightning.c
index 85d6ee09f11f..c6e74c7945cb 100644
--- a/drivers/input/gameport/lightning.c
+++ b/drivers/input/gameport/lightning.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <asm/io.h>
diff --git a/drivers/input/gameport/ns558.c b/drivers/input/gameport/ns558.c
index 7c217848613e..6437645858f9 100644
--- a/drivers/input/gameport/ns558.c
+++ b/drivers/input/gameport/ns558.c
@@ -21,10 +21,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <asm/io.h>
diff --git a/drivers/input/joystick/a3d.c b/drivers/input/joystick/a3d.c
index 55efdfc7eb62..98307039a534 100644
--- a/drivers/input/joystick/a3d.c
+++ b/drivers/input/joystick/a3d.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/adi.c b/drivers/input/joystick/adi.c
index 15a71acb6997..f466c0d34247 100644
--- a/drivers/input/joystick/adi.c
+++ b/drivers/input/joystick/adi.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/joystick/amijoy.c b/drivers/input/joystick/amijoy.c
index c65b5fa69f1e..2b82a838c511 100644
--- a/drivers/input/joystick/amijoy.c
+++ b/drivers/input/joystick/amijoy.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/types.h>
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index c79dbcb4d146..2b445c8d3fcd 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/joystick/cobra.c b/drivers/input/joystick/cobra.c
index ae3ee24a2368..14cb956beac4 100644
--- a/drivers/input/joystick/cobra.c
+++ b/drivers/input/joystick/cobra.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/db9.c b/drivers/input/joystick/db9.c
index de0dd4756c84..7721cc0bf569 100644
--- a/drivers/input/joystick/db9.c
+++ b/drivers/input/joystick/db9.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/gamecon.c b/drivers/input/joystick/gamecon.c
index 4e10ffdf8a36..d62e73dd9f7f 100644
--- a/drivers/input/joystick/gamecon.c
+++ b/drivers/input/joystick/gamecon.c
@@ -24,10 +24,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/input/joystick/gf2k.c b/drivers/input/joystick/gf2k.c
index 0f519db64748..50a60065ab14 100644
--- a/drivers/input/joystick/gf2k.c
+++ b/drivers/input/joystick/gf2k.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/joystick/grip.c b/drivers/input/joystick/grip.c
index eac9c5b8d73e..e10395ba62bc 100644
--- a/drivers/input/joystick/grip.c
+++ b/drivers/input/joystick/grip.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/guillemot.c b/drivers/input/joystick/guillemot.c
index a9ac2f9cfce0..43ff817d80ac 100644
--- a/drivers/input/joystick/guillemot.c
+++ b/drivers/input/joystick/guillemot.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/iforce/iforce-ff.c b/drivers/input/joystick/iforce/iforce-ff.c
index 0de9a0943a9e..2ed923874bbc 100644
--- a/drivers/input/joystick/iforce/iforce-ff.c
+++ b/drivers/input/joystick/iforce/iforce-ff.c
@@ -19,10 +19,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include "iforce.h"
diff --git a/drivers/input/joystick/iforce/iforce-main.c b/drivers/input/joystick/iforce/iforce-main.c
index daeeb4c7e3b0..95bbd61d3fbe 100644
--- a/drivers/input/joystick/iforce/iforce-main.c
+++ b/drivers/input/joystick/iforce/iforce-main.c
@@ -19,10 +19,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include "iforce.h"
diff --git a/drivers/input/joystick/iforce/iforce-packets.c b/drivers/input/joystick/iforce/iforce-packets.c
index fedaaea5c16d..1cea950276dd 100644
--- a/drivers/input/joystick/iforce/iforce-packets.c
+++ b/drivers/input/joystick/iforce/iforce-packets.c
@@ -19,10 +19,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include "iforce.h"
diff --git a/drivers/input/joystick/iforce/iforce-serio.c b/drivers/input/joystick/iforce/iforce-serio.c
index 154e827b559b..f4ba4a751fe0 100644
--- a/drivers/input/joystick/iforce/iforce-serio.c
+++ b/drivers/input/joystick/iforce/iforce-serio.c
@@ -19,10 +19,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include "iforce.h"
diff --git a/drivers/input/joystick/iforce/iforce-usb.c b/drivers/input/joystick/iforce/iforce-usb.c
index e8724f1a4a25..78073259c9a1 100644
--- a/drivers/input/joystick/iforce/iforce-usb.c
+++ b/drivers/input/joystick/iforce/iforce-usb.c
@@ -19,10 +19,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include "iforce.h"
diff --git a/drivers/input/joystick/iforce/iforce.h b/drivers/input/joystick/iforce/iforce.h
index 96ae4f5bd0eb..aee43d14f699 100644
--- a/drivers/input/joystick/iforce/iforce.h
+++ b/drivers/input/joystick/iforce/iforce.h
@@ -19,10 +19,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/interact.c b/drivers/input/joystick/interact.c
index 17c2c800743c..598788b3da62 100644
--- a/drivers/input/joystick/interact.c
+++ b/drivers/input/joystick/interact.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/joydump.c b/drivers/input/joystick/joydump.c
index d1c6e4846a4a..2ea05ade4d4e 100644
--- a/drivers/input/joystick/joydump.c
+++ b/drivers/input/joystick/joydump.c
@@ -21,10 +21,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/joystick/magellan.c b/drivers/input/joystick/magellan.c
index a9d0e3edca94..95a34ab34fc3 100644
--- a/drivers/input/joystick/magellan.c
+++ b/drivers/input/joystick/magellan.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *  Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/sidewinder.c b/drivers/input/joystick/sidewinder.c
index 5e602a6852b7..f46bf4d41972 100644
--- a/drivers/input/joystick/sidewinder.c
+++ b/drivers/input/joystick/sidewinder.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/joystick/spaceball.c b/drivers/input/joystick/spaceball.c
index bb3faeff8cac..ffb9c1f495b6 100644
--- a/drivers/input/joystick/spaceball.c
+++ b/drivers/input/joystick/spaceball.c
@@ -24,10 +24,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *  Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/spaceorb.c b/drivers/input/joystick/spaceorb.c
index 05da0ed514e2..20540ee71d7f 100644
--- a/drivers/input/joystick/spaceorb.c
+++ b/drivers/input/joystick/spaceorb.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *  Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/stinger.c b/drivers/input/joystick/stinger.c
index cb10e7b097ae..ba8579435d6c 100644
--- a/drivers/input/joystick/stinger.c
+++ b/drivers/input/joystick/stinger.c
@@ -21,10 +21,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *  Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/tmdc.c b/drivers/input/joystick/tmdc.c
index 7e17cde464f0..6f4a01cfe79f 100644
--- a/drivers/input/joystick/tmdc.c
+++ b/drivers/input/joystick/tmdc.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/joystick/turbografx.c b/drivers/input/joystick/turbografx.c
index e2685753e460..bf2f9925e416 100644
--- a/drivers/input/joystick/turbografx.c
+++ b/drivers/input/joystick/turbografx.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/joystick/warrior.c b/drivers/input/joystick/warrior.c
index ef5391ba4470..b60cab168e2a 100644
--- a/drivers/input/joystick/warrior.c
+++ b/drivers/input/joystick/warrior.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *  Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/input/keyboard/amikbd.c b/drivers/input/keyboard/amikbd.c
index e04a3b4e55d6..420e33c49e58 100644
--- a/drivers/input/keyboard/amikbd.c
+++ b/drivers/input/keyboard/amikbd.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/keyboard/atakbd.c b/drivers/input/keyboard/atakbd.c
index f1235831283d..6f62da2909ec 100644
--- a/drivers/input/keyboard/atakbd.c
+++ b/drivers/input/keyboard/atakbd.c
@@ -34,10 +34,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/keyboard/newtonkbd.c b/drivers/input/keyboard/newtonkbd.c
index fb9b8e23ab93..de26e2df0ad5 100644
--- a/drivers/input/keyboard/newtonkbd.c
+++ b/drivers/input/keyboard/newtonkbd.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <j.cormack@doc.ic.ac.uk>, or by paper mail:
- * Justin Cormack, 68 Dartmouth Park Road, London NW5 1SN, UK.
  */
 
 #include <linux/slab.h>
diff --git a/drivers/input/keyboard/stowaway.c b/drivers/input/keyboard/stowaway.c
index 8b6de9a692dc..15a5e74dbe91 100644
--- a/drivers/input/keyboard/stowaway.c
+++ b/drivers/input/keyboard/stowaway.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <marek.vasut@gmail.com>, or by paper mail:
- * Marek Vasut, Liskovecka 559, Frydek-Mistek, 738 01 Czech Republic
  */
 
 #include <linux/slab.h>
diff --git a/drivers/input/keyboard/sunkbd.c b/drivers/input/keyboard/sunkbd.c
index c95707ea2656..ad5d7f94f95a 100644
--- a/drivers/input/keyboard/sunkbd.c
+++ b/drivers/input/keyboard/sunkbd.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/keyboard/xtkbd.c b/drivers/input/keyboard/xtkbd.c
index 8f64b9ded8d0..f7598114b962 100644
--- a/drivers/input/keyboard/xtkbd.c
+++ b/drivers/input/keyboard/xtkbd.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/slab.h>
diff --git a/drivers/input/mouse/inport.c b/drivers/input/mouse/inport.c
index 9ce71dfa0de1..b9e68606c44a 100644
--- a/drivers/input/mouse/inport.c
+++ b/drivers/input/mouse/inport.c
@@ -26,10 +26,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/mouse/logibm.c b/drivers/input/mouse/logibm.c
index 6f165e053f4d..2fd6c84cd5b7 100644
--- a/drivers/input/mouse/logibm.c
+++ b/drivers/input/mouse/logibm.c
@@ -27,10 +27,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/mouse/pc110pad.c b/drivers/input/mouse/pc110pad.c
index 7b02b652e267..b8965e6bc890 100644
--- a/drivers/input/mouse/pc110pad.c
+++ b/drivers/input/mouse/pc110pad.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/mouse/sermouse.c b/drivers/input/mouse/sermouse.c
index 8df526620ebf..d27816ac038f 100644
--- a/drivers/input/mouse/sermouse.c
+++ b/drivers/input/mouse/sermouse.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/serio/ct82c710.c b/drivers/input/serio/ct82c710.c
index 9c54c43c9749..2d1e2993b5a8 100644
--- a/drivers/input/serio/ct82c710.c
+++ b/drivers/input/serio/ct82c710.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/delay.h>
diff --git a/drivers/input/serio/q40kbd.c b/drivers/input/serio/q40kbd.c
index d0fccc8ec259..fbb6b33845fa 100644
--- a/drivers/input/serio/q40kbd.c
+++ b/drivers/input/serio/q40kbd.c
@@ -23,10 +23,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/serio/rpckbd.c b/drivers/input/serio/rpckbd.c
index 8cf964736902..a308d7811427 100644
--- a/drivers/input/serio/rpckbd.c
+++ b/drivers/input/serio/rpckbd.c
@@ -21,10 +21,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/module.h>
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index 24a90c8db5b3..2e1fb0649260 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/input/touchscreen/gunze.c b/drivers/input/touchscreen/gunze.c
index 481586909d28..054c2537b392 100644
--- a/drivers/input/touchscreen/gunze.c
+++ b/drivers/input/touchscreen/gunze.c
@@ -20,10 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
  */
 
 #include <linux/errno.h>
diff --git a/include/linux/joystick.h b/include/linux/joystick.h
index cbf2aa9e93b9..5153f5b9294c 100644
--- a/include/linux/joystick.h
+++ b/include/linux/joystick.h
@@ -17,10 +17,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- * 
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@suse.cz>, or by paper mail:
- * Vojtech Pavlik, Ucitelska 1576, Prague 8, 182 00 Czech Republic
  */
 #ifndef _LINUX_JOYSTICK_H
 #define _LINUX_JOYSTICK_H
diff --git a/include/uapi/linux/joystick.h b/include/uapi/linux/joystick.h
index 64aabb84a66d..192bf2cf182d 100644
--- a/include/uapi/linux/joystick.h
+++ b/include/uapi/linux/joystick.h
@@ -18,10 +18,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- * 
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@suse.cz>, or by paper mail:
- * Vojtech Pavlik, Ucitelska 1576, Prague 8, 182 00 Czech Republic
  */
 #ifndef _UAPI_LINUX_JOYSTICK_H
 #define _UAPI_LINUX_JOYSTICK_H
-- 
cgit v1.2.3


From 3570a00841fb8a5d2f56ac7c59ccc6c91ea35944 Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Tue, 29 May 2018 15:26:12 -0500
Subject: can: uapi: can.h: Fix can error class mask dir path

The CAN error class mask definitions live in
include/uapi/linux/can/error.h, not include/linux/can/error.h.

Fix the path in the can.h comment so it points to the correct location.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h
index d7f97ac197a9..0afb7d8e867f 100644
--- a/include/uapi/linux/can.h
+++ b/include/uapi/linux/can.h
@@ -77,7 +77,7 @@ typedef __u32 canid_t;
 /*
  * Controller Area Network Error Message Frame Mask structure
  *
- * bit 0-28	: error class mask (see include/linux/can/error.h)
+ * bit 0-28	: error class mask (see include/uapi/linux/can/error.h)
  * bit 29-31	: set to zero
  */
 typedef __u32 can_err_mask_t;
-- 
cgit v1.2.3


From e5225c820c057537dc780244760e2e24c7d27366 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Date: Thu, 26 Jul 2018 04:17:53 -0400
Subject: media: uvcvideo: Send a control event when a Control Change interrupt
 arrives

UVC defines a method of handling asynchronous controls, which sends a
USB packet over the interrupt pipe. This patch implements support for
such packets by sending a control event to the user. Since this can
involve USB traffic and, therefore, scheduling, this has to be done
in a work queue.

Signed-off-by: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/usb/uvc/uvc_ctrl.c   | 211 ++++++++++++++++++++++++++++---------
 drivers/media/usb/uvc/uvc_status.c | 121 ++++++++++++++++++---
 drivers/media/usb/uvc/uvc_v4l2.c   |   4 +-
 drivers/media/usb/uvc/uvcvideo.h   |  15 ++-
 include/uapi/linux/uvcvideo.h      |   2 +
 5 files changed, 286 insertions(+), 67 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c
index 12b5be66fd2f..c2ad102bd693 100644
--- a/drivers/media/usb/uvc/uvc_ctrl.c
+++ b/drivers/media/usb/uvc/uvc_ctrl.c
@@ -20,6 +20,7 @@
 #include <linux/videodev2.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
+#include <linux/workqueue.h>
 #include <linux/atomic.h>
 #include <media/v4l2-ctrls.h>
 
@@ -971,12 +972,30 @@ static int uvc_ctrl_populate_cache(struct uvc_video_chain *chain,
 	return 0;
 }
 
+static s32 __uvc_ctrl_get_value(struct uvc_control_mapping *mapping,
+				const u8 *data)
+{
+	s32 value = mapping->get(mapping, UVC_GET_CUR, data);
+
+	if (mapping->v4l2_type == V4L2_CTRL_TYPE_MENU) {
+		struct uvc_menu_info *menu = mapping->menu_info;
+		unsigned int i;
+
+		for (i = 0; i < mapping->menu_count; ++i, ++menu) {
+			if (menu->value == value) {
+				value = i;
+				break;
+			}
+		}
+	}
+
+	return value;
+}
+
 static int __uvc_ctrl_get(struct uvc_video_chain *chain,
 	struct uvc_control *ctrl, struct uvc_control_mapping *mapping,
 	s32 *value)
 {
-	struct uvc_menu_info *menu;
-	unsigned int i;
 	int ret;
 
 	if ((ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR) == 0)
@@ -993,18 +1012,8 @@ static int __uvc_ctrl_get(struct uvc_video_chain *chain,
 		ctrl->loaded = 1;
 	}
 
-	*value = mapping->get(mapping, UVC_GET_CUR,
-		uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT));
-
-	if (mapping->v4l2_type == V4L2_CTRL_TYPE_MENU) {
-		menu = mapping->menu_info;
-		for (i = 0; i < mapping->menu_count; ++i, ++menu) {
-			if (menu->value == *value) {
-				*value = i;
-				break;
-			}
-		}
-	}
+	*value = __uvc_ctrl_get_value(mapping,
+				uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT));
 
 	return 0;
 }
@@ -1216,53 +1225,135 @@ static void uvc_ctrl_fill_event(struct uvc_video_chain *chain,
 	ev->u.ctrl.default_value = v4l2_ctrl.default_value;
 }
 
-static void uvc_ctrl_send_event(struct uvc_fh *handle,
-	struct uvc_control *ctrl, struct uvc_control_mapping *mapping,
-	s32 value, u32 changes)
+/*
+ * Send control change events to all subscribers for the @ctrl control. By
+ * default the subscriber that generated the event, as identified by @handle,
+ * is not notified unless it has set the V4L2_EVENT_SUB_FL_ALLOW_FEEDBACK flag.
+ * @handle can be NULL for asynchronous events related to auto-update controls,
+ * in which case all subscribers are notified.
+ */
+static void uvc_ctrl_send_event(struct uvc_video_chain *chain,
+	struct uvc_fh *handle, struct uvc_control *ctrl,
+	struct uvc_control_mapping *mapping, s32 value, u32 changes)
 {
+	struct v4l2_fh *originator = handle ? &handle->vfh : NULL;
 	struct v4l2_subscribed_event *sev;
 	struct v4l2_event ev;
 
 	if (list_empty(&mapping->ev_subs))
 		return;
 
-	uvc_ctrl_fill_event(handle->chain, &ev, ctrl, mapping, value, changes);
+	uvc_ctrl_fill_event(chain, &ev, ctrl, mapping, value, changes);
 
 	list_for_each_entry(sev, &mapping->ev_subs, node) {
-		if (sev->fh != &handle->vfh ||
+		if (sev->fh != originator ||
 		    (sev->flags & V4L2_EVENT_SUB_FL_ALLOW_FEEDBACK) ||
 		    (changes & V4L2_EVENT_CTRL_CH_FLAGS))
 			v4l2_event_queue_fh(sev->fh, &ev);
 	}
 }
 
-static void uvc_ctrl_send_slave_event(struct uvc_fh *handle,
-	struct uvc_control *master, u32 slave_id,
-	const struct v4l2_ext_control *xctrls, unsigned int xctrls_count)
+/*
+ * Send control change events for the slave of the @master control identified
+ * by the V4L2 ID @slave_id. The @handle identifies the event subscriber that
+ * generated the event and may be NULL for auto-update events.
+ */
+static void uvc_ctrl_send_slave_event(struct uvc_video_chain *chain,
+	struct uvc_fh *handle, struct uvc_control *master, u32 slave_id)
 {
 	struct uvc_control_mapping *mapping = NULL;
 	struct uvc_control *ctrl = NULL;
 	u32 changes = V4L2_EVENT_CTRL_CH_FLAGS;
-	unsigned int i;
 	s32 val = 0;
 
-	/*
-	 * We can skip sending an event for the slave if the slave
-	 * is being modified in the same transaction.
-	 */
-	for (i = 0; i < xctrls_count; i++) {
-		if (xctrls[i].id == slave_id)
-			return;
-	}
-
 	__uvc_find_control(master->entity, slave_id, &mapping, &ctrl, 0);
 	if (ctrl == NULL)
 		return;
 
-	if (__uvc_ctrl_get(handle->chain, ctrl, mapping, &val) == 0)
+	if (__uvc_ctrl_get(chain, ctrl, mapping, &val) == 0)
 		changes |= V4L2_EVENT_CTRL_CH_VALUE;
 
-	uvc_ctrl_send_event(handle, ctrl, mapping, val, changes);
+	uvc_ctrl_send_event(chain, handle, ctrl, mapping, val, changes);
+}
+
+static void uvc_ctrl_status_event_work(struct work_struct *work)
+{
+	struct uvc_device *dev = container_of(work, struct uvc_device,
+					      async_ctrl.work);
+	struct uvc_ctrl_work *w = &dev->async_ctrl;
+	struct uvc_video_chain *chain = w->chain;
+	struct uvc_control_mapping *mapping;
+	struct uvc_control *ctrl = w->ctrl;
+	struct uvc_fh *handle;
+	unsigned int i;
+	int ret;
+
+	mutex_lock(&chain->ctrl_mutex);
+
+	handle = ctrl->handle;
+	ctrl->handle = NULL;
+
+	list_for_each_entry(mapping, &ctrl->info.mappings, list) {
+		s32 value = __uvc_ctrl_get_value(mapping, w->data);
+
+		/*
+		 * handle may be NULL here if the device sends auto-update
+		 * events without a prior related control set from userspace.
+		 */
+		for (i = 0; i < ARRAY_SIZE(mapping->slave_ids); ++i) {
+			if (!mapping->slave_ids[i])
+				break;
+
+			uvc_ctrl_send_slave_event(chain, handle, ctrl,
+						  mapping->slave_ids[i]);
+		}
+
+		uvc_ctrl_send_event(chain, handle, ctrl, mapping, value,
+				    V4L2_EVENT_CTRL_CH_VALUE);
+	}
+
+	mutex_unlock(&chain->ctrl_mutex);
+
+	/* Resubmit the URB. */
+	w->urb->interval = dev->int_ep->desc.bInterval;
+	ret = usb_submit_urb(w->urb, GFP_KERNEL);
+	if (ret < 0)
+		uvc_printk(KERN_ERR, "Failed to resubmit status URB (%d).\n",
+			   ret);
+}
+
+bool uvc_ctrl_status_event(struct urb *urb, struct uvc_video_chain *chain,
+			   struct uvc_control *ctrl, const u8 *data)
+{
+	struct uvc_device *dev = chain->dev;
+	struct uvc_ctrl_work *w = &dev->async_ctrl;
+
+	if (list_empty(&ctrl->info.mappings)) {
+		ctrl->handle = NULL;
+		return false;
+	}
+
+	w->data = data;
+	w->urb = urb;
+	w->chain = chain;
+	w->ctrl = ctrl;
+
+	schedule_work(&w->work);
+
+	return true;
+}
+
+static bool uvc_ctrl_xctrls_has_control(const struct v4l2_ext_control *xctrls,
+					unsigned int xctrls_count, u32 id)
+{
+	unsigned int i;
+
+	for (i = 0; i < xctrls_count; ++i) {
+		if (xctrls[i].id == id)
+			return true;
+	}
+
+	return false;
 }
 
 static void uvc_ctrl_send_events(struct uvc_fh *handle,
@@ -1277,29 +1368,39 @@ static void uvc_ctrl_send_events(struct uvc_fh *handle,
 	for (i = 0; i < xctrls_count; ++i) {
 		ctrl = uvc_find_control(handle->chain, xctrls[i].id, &mapping);
 
+		if (ctrl->info.flags & UVC_CTRL_FLAG_ASYNCHRONOUS)
+			/* Notification will be sent from an Interrupt event. */
+			continue;
+
 		for (j = 0; j < ARRAY_SIZE(mapping->slave_ids); ++j) {
-			if (!mapping->slave_ids[j])
+			u32 slave_id = mapping->slave_ids[j];
+
+			if (!slave_id)
 				break;
-			uvc_ctrl_send_slave_event(handle, ctrl,
-						  mapping->slave_ids[j],
-						  xctrls, xctrls_count);
+
+			/*
+			 * We can skip sending an event for the slave if the
+			 * slave is being modified in the same transaction.
+			 */
+			if (uvc_ctrl_xctrls_has_control(xctrls, xctrls_count,
+							slave_id))
+				continue;
+
+			uvc_ctrl_send_slave_event(handle->chain, handle, ctrl,
+						  slave_id);
 		}
 
 		/*
 		 * If the master is being modified in the same transaction
 		 * flags may change too.
 		 */
-		if (mapping->master_id) {
-			for (j = 0; j < xctrls_count; j++) {
-				if (xctrls[j].id == mapping->master_id) {
-					changes |= V4L2_EVENT_CTRL_CH_FLAGS;
-					break;
-				}
-			}
-		}
+		if (mapping->master_id &&
+		    uvc_ctrl_xctrls_has_control(xctrls, xctrls_count,
+						mapping->master_id))
+			changes |= V4L2_EVENT_CTRL_CH_FLAGS;
 
-		uvc_ctrl_send_event(handle, ctrl, mapping, xctrls[i].value,
-				    changes);
+		uvc_ctrl_send_event(handle->chain, handle, ctrl, mapping,
+				    xctrls[i].value, changes);
 	}
 }
 
@@ -1472,9 +1573,10 @@ int uvc_ctrl_get(struct uvc_video_chain *chain,
 	return __uvc_ctrl_get(chain, ctrl, mapping, &xctrl->value);
 }
 
-int uvc_ctrl_set(struct uvc_video_chain *chain,
+int uvc_ctrl_set(struct uvc_fh *handle,
 	struct v4l2_ext_control *xctrl)
 {
+	struct uvc_video_chain *chain = handle->chain;
 	struct uvc_control *ctrl;
 	struct uvc_control_mapping *mapping;
 	s32 value;
@@ -1581,6 +1683,9 @@ int uvc_ctrl_set(struct uvc_video_chain *chain,
 	mapping->set(mapping, value,
 		uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT));
 
+	if (ctrl->info.flags & UVC_CTRL_FLAG_ASYNCHRONOUS)
+		ctrl->handle = handle;
+
 	ctrl->dirty = 1;
 	ctrl->modified = 1;
 	return 0;
@@ -1612,7 +1717,9 @@ static int uvc_ctrl_get_flags(struct uvc_device *dev,
 			    |  (data[0] & UVC_CONTROL_CAP_SET ?
 				UVC_CTRL_FLAG_SET_CUR : 0)
 			    |  (data[0] & UVC_CONTROL_CAP_AUTOUPDATE ?
-				UVC_CTRL_FLAG_AUTO_UPDATE : 0);
+				UVC_CTRL_FLAG_AUTO_UPDATE : 0)
+			    |  (data[0] & UVC_CONTROL_CAP_ASYNCHRONOUS ?
+				UVC_CTRL_FLAG_ASYNCHRONOUS : 0);
 
 	kfree(data);
 	return ret;
@@ -2173,6 +2280,8 @@ int uvc_ctrl_init_device(struct uvc_device *dev)
 	struct uvc_entity *entity;
 	unsigned int i;
 
+	INIT_WORK(&dev->async_ctrl.work, uvc_ctrl_status_event_work);
+
 	/* Walk the entities list and instantiate controls */
 	list_for_each_entry(entity, &dev->entities, list) {
 		struct uvc_control *ctrl;
@@ -2241,6 +2350,8 @@ void uvc_ctrl_cleanup_device(struct uvc_device *dev)
 	struct uvc_entity *entity;
 	unsigned int i;
 
+	cancel_work_sync(&dev->async_ctrl.work);
+
 	/* Free controls and control mappings for all entities. */
 	list_for_each_entry(entity, &dev->entities, list) {
 		for (i = 0; i < entity->ncontrols; ++i) {
diff --git a/drivers/media/usb/uvc/uvc_status.c b/drivers/media/usb/uvc/uvc_status.c
index 7b710410584a..0722dc684378 100644
--- a/drivers/media/usb/uvc/uvc_status.c
+++ b/drivers/media/usb/uvc/uvc_status.c
@@ -78,7 +78,24 @@ static void uvc_input_report_key(struct uvc_device *dev, unsigned int code,
 /* --------------------------------------------------------------------------
  * Status interrupt endpoint
  */
-static void uvc_event_streaming(struct uvc_device *dev, u8 *data, int len)
+struct uvc_streaming_status {
+	u8	bStatusType;
+	u8	bOriginator;
+	u8	bEvent;
+	u8	bValue[];
+} __packed;
+
+struct uvc_control_status {
+	u8	bStatusType;
+	u8	bOriginator;
+	u8	bEvent;
+	u8	bSelector;
+	u8	bAttribute;
+	u8	bValue[];
+} __packed;
+
+static void uvc_event_streaming(struct uvc_device *dev,
+				struct uvc_streaming_status *status, int len)
 {
 	if (len < 3) {
 		uvc_trace(UVC_TRACE_STATUS, "Invalid streaming status event "
@@ -86,31 +103,97 @@ static void uvc_event_streaming(struct uvc_device *dev, u8 *data, int len)
 		return;
 	}
 
-	if (data[2] == 0) {
+	if (status->bEvent == 0) {
 		if (len < 4)
 			return;
 		uvc_trace(UVC_TRACE_STATUS, "Button (intf %u) %s len %d\n",
-			data[1], data[3] ? "pressed" : "released", len);
-		uvc_input_report_key(dev, KEY_CAMERA, data[3]);
+			  status->bOriginator,
+			  status->bValue[0] ? "pressed" : "released", len);
+		uvc_input_report_key(dev, KEY_CAMERA, status->bValue[0]);
 	} else {
 		uvc_trace(UVC_TRACE_STATUS,
 			  "Stream %u error event %02x len %d.\n",
-			  data[1], data[2], len);
+			  status->bOriginator, status->bEvent, len);
 	}
 }
 
-static void uvc_event_control(struct uvc_device *dev, u8 *data, int len)
+#define UVC_CTRL_VALUE_CHANGE	0
+#define UVC_CTRL_INFO_CHANGE	1
+#define UVC_CTRL_FAILURE_CHANGE	2
+#define UVC_CTRL_MIN_CHANGE	3
+#define UVC_CTRL_MAX_CHANGE	4
+
+static struct uvc_control *uvc_event_entity_find_ctrl(struct uvc_entity *entity,
+						      u8 selector)
 {
-	char *attrs[3] = { "value", "info", "failure" };
+	struct uvc_control *ctrl;
+	unsigned int i;
+
+	for (i = 0, ctrl = entity->controls; i < entity->ncontrols; i++, ctrl++)
+		if (ctrl->info.selector == selector)
+			return ctrl;
+
+	return NULL;
+}
 
-	if (len < 6 || data[2] != 0 || data[4] > 2) {
+static struct uvc_control *uvc_event_find_ctrl(struct uvc_device *dev,
+					const struct uvc_control_status *status,
+					struct uvc_video_chain **chain)
+{
+	list_for_each_entry((*chain), &dev->chains, list) {
+		struct uvc_entity *entity;
+		struct uvc_control *ctrl;
+
+		list_for_each_entry(entity, &(*chain)->entities, chain) {
+			if (entity->id != status->bOriginator)
+				continue;
+
+			ctrl = uvc_event_entity_find_ctrl(entity,
+							  status->bSelector);
+			if (ctrl)
+				return ctrl;
+		}
+	}
+
+	return NULL;
+}
+
+static bool uvc_event_control(struct urb *urb,
+			      const struct uvc_control_status *status, int len)
+{
+	static const char *attrs[] = { "value", "info", "failure", "min", "max" };
+	struct uvc_device *dev = urb->context;
+	struct uvc_video_chain *chain;
+	struct uvc_control *ctrl;
+
+	if (len < 6 || status->bEvent != 0 ||
+	    status->bAttribute >= ARRAY_SIZE(attrs)) {
 		uvc_trace(UVC_TRACE_STATUS, "Invalid control status event "
 				"received.\n");
-		return;
+		return false;
 	}
 
 	uvc_trace(UVC_TRACE_STATUS, "Control %u/%u %s change len %d.\n",
-		data[1], data[3], attrs[data[4]], len);
+		  status->bOriginator, status->bSelector,
+		  attrs[status->bAttribute], len);
+
+	/* Find the control. */
+	ctrl = uvc_event_find_ctrl(dev, status, &chain);
+	if (!ctrl)
+		return false;
+
+	switch (status->bAttribute) {
+	case UVC_CTRL_VALUE_CHANGE:
+		return uvc_ctrl_status_event(urb, chain, ctrl, status->bValue);
+
+	case UVC_CTRL_INFO_CHANGE:
+	case UVC_CTRL_FAILURE_CHANGE:
+	case UVC_CTRL_MIN_CHANGE:
+	case UVC_CTRL_MAX_CHANGE:
+		break;
+	}
+
+	return false;
 }
 
 static void uvc_status_complete(struct urb *urb)
@@ -138,13 +221,23 @@ static void uvc_status_complete(struct urb *urb)
 	len = urb->actual_length;
 	if (len > 0) {
 		switch (dev->status[0] & 0x0f) {
-		case UVC_STATUS_TYPE_CONTROL:
-			uvc_event_control(dev, dev->status, len);
+		case UVC_STATUS_TYPE_CONTROL: {
+			struct uvc_control_status *status =
+				(struct uvc_control_status *)dev->status;
+
+			if (uvc_event_control(urb, status, len))
+				/* The URB will be resubmitted in work context. */
+				return;
 			break;
+		}
 
-		case UVC_STATUS_TYPE_STREAMING:
-			uvc_event_streaming(dev, dev->status, len);
+		case UVC_STATUS_TYPE_STREAMING: {
+			struct uvc_streaming_status *status =
+				(struct uvc_streaming_status *)dev->status;
+
+			uvc_event_streaming(dev, status, len);
 			break;
+		}
 
 		default:
 			uvc_trace(UVC_TRACE_STATUS, "Unknown status event "
diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c
index bd32914259ae..18a7384b50ee 100644
--- a/drivers/media/usb/uvc/uvc_v4l2.c
+++ b/drivers/media/usb/uvc/uvc_v4l2.c
@@ -994,7 +994,7 @@ static int uvc_ioctl_s_ctrl(struct file *file, void *fh,
 	if (ret < 0)
 		return ret;
 
-	ret = uvc_ctrl_set(chain, &xctrl);
+	ret = uvc_ctrl_set(handle, &xctrl);
 	if (ret < 0) {
 		uvc_ctrl_rollback(handle);
 		return ret;
@@ -1069,7 +1069,7 @@ static int uvc_ioctl_s_try_ext_ctrls(struct uvc_fh *handle,
 		return ret;
 
 	for (i = 0; i < ctrls->count; ++ctrl, ++i) {
-		ret = uvc_ctrl_set(chain, ctrl);
+		ret = uvc_ctrl_set(handle, ctrl);
 		if (ret < 0) {
 			uvc_ctrl_rollback(handle);
 			ctrls->error_idx = commit ? ctrls->count : i;
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index 6b955e0dd956..e5f5d84f1d1d 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -12,6 +12,7 @@
 #include <linux/usb/video.h>
 #include <linux/uvcvideo.h>
 #include <linux/videodev2.h>
+#include <linux/workqueue.h>
 #include <media/media-device.h>
 #include <media/v4l2-device.h>
 #include <media/v4l2-event.h>
@@ -259,6 +260,8 @@ struct uvc_control {
 	   initialized:1;
 
 	u8 *uvc_data;
+
+	struct uvc_fh *handle;	/* File handle that last changed the control. */
 };
 
 struct uvc_format_desc {
@@ -603,6 +606,14 @@ struct uvc_device {
 	u8 *status;
 	struct input_dev *input;
 	char input_phys[64];
+
+	struct uvc_ctrl_work {
+		struct work_struct work;
+		struct urb *urb;
+		struct uvc_video_chain *chain;
+		struct uvc_control *ctrl;
+		const void *data;
+	} async_ctrl;
 };
 
 enum uvc_handle_state {
@@ -756,6 +767,8 @@ int uvc_ctrl_add_mapping(struct uvc_video_chain *chain,
 int uvc_ctrl_init_device(struct uvc_device *dev);
 void uvc_ctrl_cleanup_device(struct uvc_device *dev);
 int uvc_ctrl_restore_values(struct uvc_device *dev);
+bool uvc_ctrl_status_event(struct urb *urb, struct uvc_video_chain *chain,
+			   struct uvc_control *ctrl, const u8 *data);
 
 int uvc_ctrl_begin(struct uvc_video_chain *chain);
 int __uvc_ctrl_commit(struct uvc_fh *handle, int rollback,
@@ -773,7 +786,7 @@ static inline int uvc_ctrl_rollback(struct uvc_fh *handle)
 }
 
 int uvc_ctrl_get(struct uvc_video_chain *chain, struct v4l2_ext_control *xctrl);
-int uvc_ctrl_set(struct uvc_video_chain *chain, struct v4l2_ext_control *xctrl);
+int uvc_ctrl_set(struct uvc_fh *handle, struct v4l2_ext_control *xctrl);
 
 int uvc_xu_ctrl_query(struct uvc_video_chain *chain,
 		      struct uvc_xu_control_query *xqry);
diff --git a/include/uapi/linux/uvcvideo.h b/include/uapi/linux/uvcvideo.h
index 020714d2c5bd..f80f05b3c423 100644
--- a/include/uapi/linux/uvcvideo.h
+++ b/include/uapi/linux/uvcvideo.h
@@ -28,6 +28,8 @@
 #define UVC_CTRL_FLAG_RESTORE		(1 << 6)
 /* Control can be updated by the camera. */
 #define UVC_CTRL_FLAG_AUTO_UPDATE	(1 << 7)
+/* Control supports asynchronous reporting */
+#define UVC_CTRL_FLAG_ASYNCHRONOUS	(1 << 8)
 
 #define UVC_CTRL_FLAG_GET_RANGE \
 	(UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_MIN | \
-- 
cgit v1.2.3


From d0dd962d8a4ef4df9b710c4e3a975e6bfd9f0225 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Mon, 28 May 2018 10:32:41 -0400
Subject: media: dvb: get rid of VIDEO_SET_SPU_PALETTE

No upstream driver uses this ioctl. It doesn't make any sense to keep
compat32 code for something that nobody uses upstream.

Reported-by: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 .../media/uapi/dvb/video-set-spu-palette.rst       | 82 ----------------------
 .../media/uapi/dvb/video_function_calls.rst        |  1 -
 Documentation/media/uapi/dvb/video_types.rst       | 18 -----
 Documentation/media/video.h.rst.exceptions         |  1 -
 fs/compat_ioctl.c                                  | 30 --------
 include/uapi/linux/dvb/video.h                     |  7 --
 6 files changed, 139 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/video-set-spu-palette.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/video-set-spu-palette.rst b/Documentation/media/uapi/dvb/video-set-spu-palette.rst
deleted file mode 100644
index 51a1913d21d2..000000000000
--- a/Documentation/media/uapi/dvb/video-set-spu-palette.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_SET_SPU_PALETTE:
-
-=====================
-VIDEO_SET_SPU_PALETTE
-=====================
-
-Name
-----
-
-VIDEO_SET_SPU_PALETTE
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, VIDEO_SET_SPU_PALETTE, struct video_spu_palette *palette )
-    :name: VIDEO_SET_SPU_PALETTE
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_SET_SPU_PALETTE for this command.
-
-    -  .. row 3
-
-       -  video_spu_palette_t \*palette
-
-       -  SPU palette according to section ??.
-
-
-Description
------------
-
-This ioctl sets the SPU color palette.
-
-.. c:type:: video_spu_palette
-
-.. code-block::c
-
-	typedef struct video_spu_palette {      /* SPU Palette information */
-		int length;
-		__u8 __user *palette;
-	} video_spu_palette_t;
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  input is not a valid palette or driver doesn’t handle SPU.
diff --git a/Documentation/media/uapi/dvb/video_function_calls.rst b/Documentation/media/uapi/dvb/video_function_calls.rst
index 68588ac7fecb..8d8383ffaeba 100644
--- a/Documentation/media/uapi/dvb/video_function_calls.rst
+++ b/Documentation/media/uapi/dvb/video_function_calls.rst
@@ -38,6 +38,5 @@ Video Function Calls
     video-set-system
     video-set-highlight
     video-set-spu
-    video-set-spu-palette
     video-get-navi
     video-set-attributes
diff --git a/Documentation/media/uapi/dvb/video_types.rst b/Documentation/media/uapi/dvb/video_types.rst
index 640a21de6b8a..4cfa00e5c934 100644
--- a/Documentation/media/uapi/dvb/video_types.rst
+++ b/Documentation/media/uapi/dvb/video_types.rst
@@ -320,24 +320,6 @@ to the following format:
      } video_spu_t;
 
 
-.. c:type:: video_spu_palette
-
-struct video_spu_palette
-========================
-
-The following structure is used to set the SPU palette by calling
-VIDEO_SPU_PALETTE:
-
-
-.. code-block:: c
-
-     typedef
-     struct video_spu_palette {
-	 int length;
-	 uint8_t *palette;
-     } video_spu_palette_t;
-
-
 .. c:type:: video_navi_pack
 
 struct video_navi_pack
diff --git a/Documentation/media/video.h.rst.exceptions b/Documentation/media/video.h.rst.exceptions
index a91aa884ce0e..89d7c3ef2da7 100644
--- a/Documentation/media/video.h.rst.exceptions
+++ b/Documentation/media/video.h.rst.exceptions
@@ -36,5 +36,4 @@ replace typedef video_stream_source_t :c:type:`video_stream_source`
 replace typedef video_play_state_t :c:type:`video_play_state`
 replace typedef video_highlight_t :c:type:`video_highlight`
 replace typedef video_spu_t :c:type:`video_spu`
-replace typedef video_spu_palette_t :c:type:`video_spu_palette`
 replace typedef video_navi_pack_t :c:type:`video_navi_pack`
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9907475b4226..fdb5ef9b5d06 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -198,34 +198,6 @@ static int do_video_stillpicture(struct file *file,
 	return err;
 }
 
-struct compat_video_spu_palette {
-	int length;
-	compat_uptr_t palette;
-};
-
-static int do_video_set_spu_palette(struct file *file,
-		unsigned int cmd, struct compat_video_spu_palette __user *up)
-{
-	struct video_spu_palette __user *up_native;
-	compat_uptr_t palp;
-	int length, err;
-
-	err  = get_user(palp, &up->palette);
-	err |= get_user(length, &up->length);
-	if (err)
-		return -EFAULT;
-
-	up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
-	err  = put_user(compat_ptr(palp), &up_native->palette);
-	err |= put_user(length, &up_native->length);
-	if (err)
-		return -EFAULT;
-
-	err = do_ioctl(file, cmd, (unsigned long) up_native);
-
-	return err;
-}
-
 #ifdef CONFIG_BLOCK
 typedef struct sg_io_hdr32 {
 	compat_int_t interface_id;	/* [i] 'S' for SCSI generic (required) */
@@ -1347,8 +1319,6 @@ static long do_ioctl_trans(unsigned int cmd,
 		return do_video_get_event(file, cmd, argp);
 	case VIDEO_STILLPICTURE:
 		return do_video_stillpicture(file, cmd, argp);
-	case VIDEO_SET_SPU_PALETTE:
-		return do_video_set_spu_palette(file, cmd, argp);
 	}
 
 	/*
diff --git a/include/uapi/linux/dvb/video.h b/include/uapi/linux/dvb/video.h
index df3d7028c807..6a0c9757b7ba 100644
--- a/include/uapi/linux/dvb/video.h
+++ b/include/uapi/linux/dvb/video.h
@@ -186,12 +186,6 @@ typedef struct video_spu {
 } video_spu_t;
 
 
-typedef struct video_spu_palette {      /* SPU Palette information */
-	int length;
-	__u8 __user *palette;
-} video_spu_palette_t;
-
-
 typedef struct video_navi_pack {
 	int length;          /* 0 ... 1024 */
 	__u8 data[1024];
@@ -248,7 +242,6 @@ typedef __u16 video_attributes_t;
 #define VIDEO_SET_SYSTEM           _IO('o', 38)
 #define VIDEO_SET_HIGHLIGHT        _IOW('o', 39, video_highlight_t)
 #define VIDEO_SET_SPU              _IOW('o', 50, video_spu_t)
-#define VIDEO_SET_SPU_PALETTE      _IOW('o', 51, video_spu_palette_t)
 #define VIDEO_GET_NAVI             _IOR('o', 52, video_navi_pack_t)
 #define VIDEO_SET_ATTRIBUTES       _IO('o', 53)
 #define VIDEO_GET_SIZE             _IOR('o', 55, video_size_t)
-- 
cgit v1.2.3


From 87e1a881e1fe3bfd89903c54ae0d6d6adedb76f1 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Wed, 25 Jul 2018 08:51:39 -0400
Subject: media: media.h: remove linux/version.h include

The media.h public header is one of only three public headers that include
linux/version.h. Drop it from media.h. It was only used for an obsolete
define.

It has to be added to media-device.c, since that source relied on media.h
to include it.

Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 drivers/media/media-device.c | 1 +
 include/uapi/linux/media.h   | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index 14959b19a342..fcdf3d5dc4b6 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -25,6 +25,7 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/usb.h>
+#include <linux/version.h>
 
 #include <media/media-device.h>
 #include <media/media-devnode.h>
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 82ec9f132a53..36f76e777ef9 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -25,7 +25,6 @@
 #endif
 #include <linux/ioctl.h>
 #include <linux/types.h>
-#include <linux/version.h>
 
 struct media_device_info {
 	char driver[16];
@@ -421,7 +420,7 @@ struct media_v2_topology {
 #define MEDIA_INTF_T_ALSA_TIMER			(MEDIA_INTF_T_ALSA_BASE + 7)
 
 /* Obsolete symbol for media_version, no longer used in the kernel */
-#define MEDIA_API_VERSION			KERNEL_VERSION(0, 1, 0)
+#define MEDIA_API_VERSION			((0 << 16) | (1 << 8) | 0)
 
 #endif
 
-- 
cgit v1.2.3


From 75cbb3f1d840429e6aa67b351332f38b29e6292c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 26 Jul 2018 12:17:41 +0800
Subject: bcache: stop using the deprecated get_seconds()

The get_seconds function is deprecated now since it returns a 32-bit
value that will eventually overflow, and we are replacing it throughout
the kernel with ktime_get_seconds() or ktime_get_real_seconds() that
return a time64_t.

bcache uses get_seconds() to read the current system time and store it in
the superblock as well as in uuid_entry structures that are user visible.

Unfortunately, the two structures are still limited to 32 bits, so this
won't fix any real problems but will still overflow in year 2106. Let's
at least document that properly, in case we get an updated format in the
future it can be fixed. We still have a long time before the overflow
and checking the tools at https://github.com/koverstreet/bcache-tools
reveals no access to any of them.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c   | 12 ++++++------
 include/uapi/linux/bcache.h |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 40fe26fef00f..e0a92104ca23 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 		goto err;
 	}
 
-	sb->last_mount = get_seconds();
+	sb->last_mount = (u32)ktime_get_real_seconds();
 	err = NULL;
 
 	get_page(bh->b_page);
@@ -701,7 +701,7 @@ static void bcache_device_detach(struct bcache_device *d)
 
 		SET_UUID_FLASH_ONLY(u, 0);
 		memcpy(u->uuid, invalid_uuid, 16);
-		u->invalidated = cpu_to_le32(get_seconds());
+		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
 		bch_uuid_write(d->c);
 	}
 
@@ -1033,7 +1033,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 			  uint8_t *set_uuid)
 {
-	uint32_t rtime = cpu_to_le32(get_seconds());
+	uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
 	struct uuid_entry *u;
 	struct cached_dev *exist_dc, *t;
 
@@ -1076,7 +1076,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
 	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
 		memcpy(u->uuid, invalid_uuid, 16);
-		u->invalidated = cpu_to_le32(get_seconds());
+		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
 		u = NULL;
 	}
 
@@ -1398,7 +1398,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 
 	get_random_bytes(u->uuid, 16);
 	memset(u->label, 0, 32);
-	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+	u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
 
 	SET_UUID_FLASH_ONLY(u, 1);
 	u->sectors = size >> 9;
@@ -1902,7 +1902,7 @@ static void run_cache_set(struct cache_set *c)
 		goto err;
 
 	closure_sync(&cl);
-	c->sb.last_mount = get_seconds();
+	c->sb.last_mount = (u32)ktime_get_real_seconds();
 	bcache_write_super(c);
 
 	list_for_each_entry_safe(dc, t, &uncached_devices, list)
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 821f71a2e48f..8d19e02d752a 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -195,7 +195,7 @@ struct cache_sb {
 	};
 	};
 
-	__u32			last_mount;	/* time_t */
+	__u32			last_mount;	/* time overflow in y2106 */
 
 	__u16			first_bucket;
 	union {
@@ -318,7 +318,7 @@ struct uuid_entry {
 		struct {
 			__u8	uuid[16];
 			__u8	label[32];
-			__u32	first_reg;
+			__u32	first_reg; /* time overflow in y2106 */
 			__u32	last_reg;
 			__u32	invalidated;
 
-- 
cgit v1.2.3


From 3ae5536b808dced0af5b2e6768a41862620c779d Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 27 Jul 2018 10:59:57 +0200
Subject: l2tp: ignore L2TP_ATTR_DATA_SEQ netlink attribute

The value of this attribute is never used.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 7 ++++---
 net/l2tp/l2tp_core.h      | 8 --------
 net/l2tp/l2tp_debugfs.c   | 4 +---
 net/l2tp/l2tp_netlink.c   | 6 ------
 4 files changed, 5 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 7d570c7bd117..ae888606b3ec 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -65,9 +65,9 @@ struct sockaddr_l2tpip6 {
  * TUNNEL_MODIFY	- CONN_ID, udpcsum
  * TUNNEL_GETSTATS	- CONN_ID, (stats)
  * TUNNEL_GET		- CONN_ID, (...)
- * SESSION_CREATE	- SESSION_ID, PW_TYPE, data_seq, cookie, peer_cookie, l2spec
+ * SESSION_CREATE	- SESSION_ID, PW_TYPE, cookie, peer_cookie, l2spec
  * SESSION_DELETE	- SESSION_ID
- * SESSION_MODIFY	- SESSION_ID, data_seq
+ * SESSION_MODIFY	- SESSION_ID
  * SESSION_GET		- SESSION_ID, (...)
  * SESSION_GETSTATS	- SESSION_ID, (stats)
  *
@@ -95,7 +95,7 @@ enum {
 	L2TP_ATTR_PW_TYPE,		/* u16, enum l2tp_pwtype */
 	L2TP_ATTR_ENCAP_TYPE,		/* u16, enum l2tp_encap_type */
 	L2TP_ATTR_OFFSET,		/* u16 (not used) */
-	L2TP_ATTR_DATA_SEQ,		/* u16 */
+	L2TP_ATTR_DATA_SEQ,		/* u16 (not used) */
 	L2TP_ATTR_L2SPEC_TYPE,		/* u8, enum l2tp_l2spec_type */
 	L2TP_ATTR_L2SPEC_LEN,		/* u8 (not used) */
 	L2TP_ATTR_PROTO_VERSION,	/* u8 */
@@ -169,6 +169,7 @@ enum l2tp_encap_type {
 	L2TP_ENCAPTYPE_IP,
 };
 
+/* For L2TP_ATTR_DATA_SEQ. Unused. */
 enum l2tp_seqmode {
 	L2TP_SEQ_NONE = 0,
 	L2TP_SEQ_IP = 1,
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index d85fde793a8c..7dbfb55ab3b5 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -45,10 +45,6 @@ struct l2tp_tunnel;
  */
 struct l2tp_session_cfg {
 	enum l2tp_pwtype	pw_type;
-	unsigned int		data_seq:2;	/* data sequencing level
-						 * 0 => none, 1 => IP only,
-						 * 2 => all
-						 */
 	unsigned int		recv_seq:1;	/* expect receive packets with
 						 * sequence numbers? */
 	unsigned int		send_seq:1;	/* send packets with sequence
@@ -99,10 +95,6 @@ struct l2tp_session {
 
 	char			name[32];	/* for logging */
 	char			ifname[IFNAMSIZ];
-	unsigned int		data_seq:2;	/* data sequencing level
-						 * 0 => none, 1 => IP only,
-						 * 2 => all
-						 */
 	unsigned int		recv_seq:1;	/* expect receive packets with
 						 * sequence numbers? */
 	unsigned int		send_seq:1;	/* send packets with sequence
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index b5d7dde003ef..91b9248610f0 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -191,12 +191,10 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 	if (session->send_seq || session->recv_seq)
 		seq_printf(m, "   nr %hu, ns %hu\n", session->nr, session->ns);
 	seq_printf(m, "   refcnt %d\n", refcount_read(&session->ref_count));
-	seq_printf(m, "   config %d/%d/%c/%c/%s/%s %08x %u\n",
+	seq_printf(m, "   config %d/%d/%c/%c/-/%s %08x %u\n",
 		   session->mtu, session->mru,
 		   session->recv_seq ? 'R' : '-',
 		   session->send_seq ? 'S' : '-',
-		   session->data_seq == 1 ? "IPSEQ" :
-		   session->data_seq == 2 ? "DATASEQ" : "-",
 		   session->lns_mode ? "LNS" : "LAC",
 		   session->debug,
 		   jiffies_to_msecs(session->reorder_timeout));
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 5b9900889e31..e4785f6966f6 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -560,9 +560,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	}
 
 	if (tunnel->version > 2) {
-		if (info->attrs[L2TP_ATTR_DATA_SEQ])
-			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
 		if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) {
 			cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
 			if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT &&
@@ -693,9 +690,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
 	if (info->attrs[L2TP_ATTR_DEBUG])
 		session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
 
-	if (info->attrs[L2TP_ATTR_DATA_SEQ])
-		session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
 	if (info->attrs[L2TP_ATTR_RECV_SEQ])
 		session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
 
-- 
cgit v1.2.3


From ae51a7c6d54876c47ae53c455434023df2c19801 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 27 Jul 2018 10:59:58 +0200
Subject: l2tp: ignore L2TP_ATTR_VLAN_ID netlink attribute

The value of this attribute is never used.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 4 ++--
 net/l2tp/l2tp_core.h      | 1 -
 net/l2tp/l2tp_netlink.c   | 3 ---
 3 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index ae888606b3ec..41bf79a4b165 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -60,7 +60,7 @@ struct sockaddr_l2tpip6 {
 /*
  * Commands.
  * Valid TLVs of each command are:-
- * TUNNEL_CREATE	- CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, udpcsum, vlanid
+ * TUNNEL_CREATE	- CONN_ID, pw_type, netns, ifname, ipinfo, udpinfo, udpcsum
  * TUNNEL_DELETE	- CONN_ID
  * TUNNEL_MODIFY	- CONN_ID, udpcsum
  * TUNNEL_GETSTATS	- CONN_ID, (stats)
@@ -105,7 +105,7 @@ enum {
 	L2TP_ATTR_SESSION_ID,		/* u32 */
 	L2TP_ATTR_PEER_SESSION_ID,	/* u32 */
 	L2TP_ATTR_UDP_CSUM,		/* u8 */
-	L2TP_ATTR_VLAN_ID,		/* u16 */
+	L2TP_ATTR_VLAN_ID,		/* u16 (not used) */
 	L2TP_ATTR_COOKIE,		/* 0, 4 or 8 bytes */
 	L2TP_ATTR_PEER_COOKIE,		/* 0, 4 or 8 bytes */
 	L2TP_ATTR_DEBUG,		/* u32, enum l2tp_debug_flags */
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 7dbfb55ab3b5..49fd5e05538c 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -54,7 +54,6 @@ struct l2tp_session_cfg {
 						 * control of LNS. */
 	int			debug;		/* bitmask of debug message
 						 * categories */
-	u16			vlan_id;	/* VLAN pseudowire only */
 	u16			l2specific_type; /* Layer 2 specific type */
 	u8			cookie[8];	/* optional cookie */
 	int			cookie_len;	/* 0, 4 or 8 bytes */
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index e4785f6966f6..8ea1deefbc37 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -591,9 +591,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 		}
 		if (info->attrs[L2TP_ATTR_IFNAME])
 			cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
-
-		if (info->attrs[L2TP_ATTR_VLAN_ID])
-			cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
 	}
 
 	if (info->attrs[L2TP_ATTR_DEBUG])
-- 
cgit v1.2.3


From 92ea4a7eec7289468ac8de5386f4b13d9c210cb5 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 27 Jul 2018 11:00:00 +0200
Subject: l2tp: drop ->mru from struct l2tp_session

This field is not used.

Treat PPPIOC*MRU the same way as PPPIOC*FLAGS: "get" requests return 0,
while "set" requests validate the user supplied pointer but discard its
value.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  2 +-
 net/l2tp/l2tp_core.c      |  1 -
 net/l2tp/l2tp_core.h      |  2 --
 net/l2tp/l2tp_debugfs.c   |  4 ++--
 net/l2tp/l2tp_netlink.c   | 10 +---------
 net/l2tp/l2tp_ppp.c       | 41 +++++------------------------------------
 6 files changed, 9 insertions(+), 51 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 41bf79a4b165..8bb8c7cfabe5 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -120,7 +120,7 @@ enum {
 	L2TP_ATTR_UDP_SPORT,		/* u16 */
 	L2TP_ATTR_UDP_DPORT,		/* u16 */
 	L2TP_ATTR_MTU,			/* u16 */
-	L2TP_ATTR_MRU,			/* u16 */
+	L2TP_ATTR_MRU,			/* u16 (not used) */
 	L2TP_ATTR_STATS,		/* nested */
 	L2TP_ATTR_IP6_SADDR,		/* struct in6_addr */
 	L2TP_ATTR_IP6_DADDR,		/* struct in6_addr */
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index d10f4ed52d92..c61a467fd9b8 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1675,7 +1675,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 			session->pwtype = cfg->pw_type;
 			session->debug = cfg->debug;
 			session->mtu = cfg->mtu;
-			session->mru = cfg->mru;
 			session->send_seq = cfg->send_seq;
 			session->recv_seq = cfg->recv_seq;
 			session->lns_mode = cfg->lns_mode;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 49fd5e05538c..fa5ae9432d38 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -62,7 +62,6 @@ struct l2tp_session_cfg {
 	int			reorder_timeout; /* configured reorder timeout
 						  * (in jiffies) */
 	int			mtu;
-	int			mru;
 	char			*ifname;
 };
 
@@ -107,7 +106,6 @@ struct l2tp_session {
 						  * (in jiffies) */
 	int			reorder_skip;	/* set if skip to next nr */
 	int			mtu;
-	int			mru;
 	enum l2tp_pwtype	pwtype;
 	struct l2tp_stats	stats;
 	struct hlist_node	global_hlist;	/* Global hash list node */
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 91b9248610f0..aee271741f5b 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -191,8 +191,8 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 	if (session->send_seq || session->recv_seq)
 		seq_printf(m, "   nr %hu, ns %hu\n", session->nr, session->ns);
 	seq_printf(m, "   refcnt %d\n", refcount_read(&session->ref_count));
-	seq_printf(m, "   config %d/%d/%c/%c/-/%s %08x %u\n",
-		   session->mtu, session->mru,
+	seq_printf(m, "   config %d/0/%c/%c/-/%s %08x %u\n",
+		   session->mtu,
 		   session->recv_seq ? 'R' : '-',
 		   session->send_seq ? 'S' : '-',
 		   session->lns_mode ? "LNS" : "LAC",
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 8ea1deefbc37..a7c409215336 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -611,9 +611,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	if (info->attrs[L2TP_ATTR_MTU])
 		cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
 
-	if (info->attrs[L2TP_ATTR_MRU])
-		cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
 #ifdef CONFIG_MODULES
 	if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
 		genl_unlock();
@@ -704,9 +701,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
 	if (info->attrs[L2TP_ATTR_MTU])
 		session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
 
-	if (info->attrs[L2TP_ATTR_MRU])
-		session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
 	ret = l2tp_session_notify(&l2tp_nl_family, info,
 				  session, L2TP_CMD_SESSION_MODIFY);
 
@@ -737,9 +731,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 			session->peer_session_id) ||
 	    nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
 	    nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) ||
-	    nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) ||
-	    (session->mru &&
-	     nla_put_u16(skb, L2TP_ATTR_MRU, session->mru)))
+	    nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu))
 		goto nla_put_failure;
 
 	if ((session->ifname[0] &&
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 759ce8421269..44cac66284a5 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -570,10 +570,9 @@ static void pppol2tp_session_init(struct l2tp_session *session)
 	if (dst) {
 		u32 pmtu = dst_mtu(dst);
 
-		if (pmtu) {
+		if (pmtu)
 			session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD;
-			session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD;
-		}
+
 		dst_release(dst);
 	}
 }
@@ -781,7 +780,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 	} else {
 		/* Default MTU must allow space for UDP/L2TP/PPP headers */
 		cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
-		cfg.mru = cfg.mtu;
 		cfg.pw_type = L2TP_PWTYPE_PPP;
 
 		session = l2tp_session_create(sizeof(struct pppol2tp_session),
@@ -885,8 +883,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel,
 	/* Default MTU values. */
 	if (cfg->mtu == 0)
 		cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
-	if (cfg->mru == 0)
-		cfg->mru = cfg->mtu;
 
 	/* Allocate and initialize a new session context. */
 	session = l2tp_session_create(sizeof(struct pppol2tp_session),
@@ -1101,34 +1097,6 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
 		break;
 
 	case PPPIOCGMRU:
-		err = -ENXIO;
-		if (!(sk->sk_state & PPPOX_CONNECTED))
-			break;
-
-		err = -EFAULT;
-		if (put_user(session->mru, (int __user *) arg))
-			break;
-
-		l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n",
-			  session->name, session->mru);
-		err = 0;
-		break;
-
-	case PPPIOCSMRU:
-		err = -ENXIO;
-		if (!(sk->sk_state & PPPOX_CONNECTED))
-			break;
-
-		err = -EFAULT;
-		if (get_user(val, (int __user *) arg))
-			break;
-
-		session->mru = val;
-		l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
-			  session->name, session->mru);
-		err = 0;
-		break;
-
 	case PPPIOCGFLAGS:
 		err = -EFAULT;
 		if (put_user(0, (int __user *)arg))
@@ -1136,6 +1104,7 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
 		err = 0;
 		break;
 
+	case PPPIOCSMRU:
 	case PPPIOCSFLAGS:
 		err = -EFAULT;
 		if (get_user(val, (int __user *)arg))
@@ -1723,8 +1692,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
 		   tunnel->peer_tunnel_id,
 		   session->peer_session_id,
 		   state, user_data_ok);
-	seq_printf(m, "   %d/%d/%c/%c/%s %08x %u\n",
-		   session->mtu, session->mru,
+	seq_printf(m, "   %d/0/%c/%c/%s %08x %u\n",
+		   session->mtu,
 		   session->recv_seq ? 'R' : '-',
 		   session->send_seq ? 'S' : '-',
 		   session->lns_mode ? "LNS" : "LAC",
-- 
cgit v1.2.3


From 5cbf777cfdf6e5a7b7149006e4881a255da78fdd Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 27 Jul 2018 16:37:28 +0800
Subject: route: add support for directed broadcast forwarding

This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.

Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
  - target TEE's gateway param has to be set with a specific address,
    and it's not flexible, especially when the router wants to forward all
    directed broadcasts.
  - this duplicates the directed broadcasts so this may cause side
    effects to applications.

Besides, to stay consistent with other OS routers such as BSD, it's also
necessary to implement it in the route rx path.

Note that route cache needs to be flushed when bc_forwarding is
changed.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h   |  1 +
 include/uapi/linux/ip.h      |  1 +
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c           | 11 +++++++++++
 net/ipv4/route.c             |  6 +++++-
 5 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1bff3d..c759d1cbcedd 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 
 #define IN_DEV_FORWARD(in_dev)		IN_DEV_CONF_GET((in_dev), FORWARDING)
 #define IN_DEV_MFORWARD(in_dev)		IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev)		IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
 #define IN_DEV_RPFILTER(in_dev)		IN_DEV_MAXCONF((in_dev), RP_FILTER)
 #define IN_DEV_SRC_VMARK(in_dev)    	IN_DEV_ORCONF((in_dev), SRC_VMARK)
 #define IN_DEV_SOURCE_ROUTE(in_dev)	IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742beae5..e42d13b55cf3 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -168,6 +168,7 @@ enum
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
+	IPV4_DEVCONF_BC_FORWARDING,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdfca862..fac4edd55379 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -18,6 +18,7 @@ enum {
 	NETCONFA_PROXY_NEIGH,
 	NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 	NETCONFA_INPUT,
+	NETCONFA_BC_FORWARDING,
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..ea4bd8a52422 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
 		size += nla_total_size(4);
 	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
+	if (all || type == NETCONFA_BC_FORWARDING)
+		size += nla_total_size(4);
 	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
+	if ((all || type == NETCONFA_BC_FORWARDING) &&
+	    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+			IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+		goto nla_put_failure;
 	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
 			if ((new_value == 0) && (old_value != 0))
 				rt_cache_flush(net);
 
+		if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+		    new_value != old_value)
+			rt_cache_flush(net);
+
 		if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
 		    new_value != old_value) {
 			ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
 					     devinet_sysctl_forward),
 		DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+		DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
 
 		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
 		DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97106d7..b678466da451 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto no_route;
 	}
 
-	if (res->type == RTN_BROADCAST)
+	if (res->type == RTN_BROADCAST) {
+		if (IN_DEV_BFORWARD(in_dev))
+			goto make_route;
 		goto brd_input;
+	}
 
 	if (res->type == RTN_LOCAL) {
 		err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res->type != RTN_UNICAST)
 		goto martian_destination;
 
+make_route:
 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
 out:	return err;
 
-- 
cgit v1.2.3


From 4b09384aaa2a9b2ac09a584d7a9345cf003617f2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 27 Jul 2018 13:11:00 -0700
Subject: net: dcb: add DSCP to comment about priority selector types

Commit ee2059819450 ("net/dcb: Add dscp to priority selector type")
added a define for the new DSCP selector type created by
IEEE 802.1Qcd, but missed the comment enumerating all selector types.
Update the comment.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/dcbnl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index 60aa2e446698..69df19aa8e72 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -233,7 +233,8 @@ struct cee_pfc {
  *	2	Well known port number over TCP or SCTP
  *	3	Well known port number over UDP or DCCP
  *	4	Well known port number over TCP, SCTP, UDP, or DCCP
- *	5-7	Reserved
+ *	5	Differentiated Services Code Point (DSCP) value
+ *	6-7	Reserved
  *
  *  Selector field values for CEE
  *	0	Ethertype
-- 
cgit v1.2.3


From 3e7a50ceb11ea75c27e944f1a01e478fd62a2d8d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 27 Jul 2018 13:43:22 -0700
Subject: net: report min and max mtu network device settings

Report the minimum and maximum MTU allowed on a device
via netlink so that it can be displayed by tools like
ip link.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 2 ++
 net/core/rtnetlink.c         | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 553c438cabe3..43391e2d1153 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -164,6 +164,8 @@ enum {
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
 	IFLA_NEW_IFINDEX,
+	IFLA_MIN_MTU,
+	IFLA_MAX_MTU,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 92b6fa5d5f6e..510d4f765a13 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1015,6 +1015,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4)  /* IFLA_IF_NETNSID */
 	       + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
 	       + nla_total_size(4)  /* IFLA_CARRIER_DOWN_COUNT */
+	       + nla_total_size(4)  /* IFLA_MIN_MTU */
+	       + nla_total_size(4)  /* IFLA_MAX_MTU */
 	       + 0;
 }
 
@@ -1601,6 +1603,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 		       netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
 	    nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
 	    nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+	    nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
+	    nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
 	    nla_put_u32(skb, IFLA_GROUP, dev->group) ||
 	    nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
 	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
@@ -1732,6 +1736,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_IF_NETNSID]	= { .type = NLA_S32 },
 	[IFLA_CARRIER_UP_COUNT]	= { .type = NLA_U32 },
 	[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
+	[IFLA_MIN_MTU]		= { .type = NLA_U32 },
+	[IFLA_MAX_MTU]		= { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
-- 
cgit v1.2.3


From f9324952088f1cd62ea4addf9ff532f1e6452a22 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Wed, 25 Jul 2018 01:32:45 +0200
Subject: netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from
 xt_osf.c

Move nfnetlink osf subsystem from xt_osf.c to standalone module so we can
reuse it from the new nft_osf extension.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_osf.h |  11 +++
 include/uapi/linux/netfilter/xt_osf.h |   9 +-
 net/netfilter/nfnetlink_osf.c         | 154 ++++++++++++++++++++++++++++++++++
 net/netfilter/xt_osf.c                | 149 +-------------------------------
 4 files changed, 169 insertions(+), 154 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index 3738116b2bbe..cc2487ff74f6 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -70,6 +70,8 @@ struct nf_osf_nlmsg {
 	struct tcphdr			tcp;
 };
 
+extern struct list_head nf_osf_fingers[2];
+
 /* Defines for IANA option kinds */
 enum iana_options {
 	OSFOPT_EOL = 0,		/* End of options */
@@ -94,4 +96,13 @@ enum nf_osf_attr_type {
 	OSF_ATTR_MAX,
 };
 
+/*
+ * Add/remove fingerprint from the kernel.
+ */
+enum nf_osf_msg_types {
+	OSF_MSG_ADD,
+	OSF_MSG_REMOVE,
+	OSF_MSG_MAX,
+};
+
 #endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index b189007f4f28..a90e90c27cef 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -47,13 +47,6 @@
 #define xt_osf_nlmsg		nf_osf_nlmsg
 
 #define xt_osf_attr_type	nf_osf_attr_type
-/*
- * Add/remove fingerprint from the kernel.
- */
-enum xt_osf_msg_types {
-	OSF_MSG_ADD,
-	OSF_MSG_REMOVE,
-	OSF_MSG_MAX,
-};
+#define xt_osf_msg_types	nf_osf_msg_types
 
 #endif				/* _XT_OSF_H */
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index f4c75e982902..ba0fa11869ce 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -20,6 +20,13 @@
 #include <net/netfilter/nf_log.h>
 #include <linux/netfilter/nf_osf.h>
 
+/*
+ * Indexed by dont-fragment bit.
+ * It is the only constant value in the fingerprint.
+ */
+struct list_head nf_osf_fingers[2];
+EXPORT_SYMBOL_GPL(nf_osf_fingers);
+
 static inline int nf_osf_ttl(const struct sk_buff *skb,
 			     int ttl_check, unsigned char f_ttl)
 {
@@ -279,4 +286,151 @@ const char *nf_osf_find(const struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(nf_osf_find);
 
+static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
+	[OSF_ATTR_FINGER]	= { .len = sizeof(struct nf_osf_user_finger) },
+};
+
+static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
+				 struct sk_buff *skb, const struct nlmsghdr *nlh,
+				 const struct nlattr * const osf_attrs[],
+				 struct netlink_ext_ack *extack)
+{
+	struct nf_osf_user_finger *f;
+	struct nf_osf_finger *kf = NULL, *sf;
+	int err = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!osf_attrs[OSF_ATTR_FINGER])
+		return -EINVAL;
+
+	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+		return -EINVAL;
+
+	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+	kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
+	if (!kf)
+		return -ENOMEM;
+
+	memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger));
+
+	list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
+		if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
+			continue;
+
+		kfree(kf);
+		kf = NULL;
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			err = -EEXIST;
+		break;
+	}
+
+	/*
+	 * We are protected by nfnl mutex.
+	 */
+	if (kf)
+		list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]);
+
+	return err;
+}
+
+static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
+				    struct sk_buff *skb,
+				    const struct nlmsghdr *nlh,
+				    const struct nlattr * const osf_attrs[],
+				    struct netlink_ext_ack *extack)
+{
+	struct nf_osf_user_finger *f;
+	struct nf_osf_finger *sf;
+	int err = -ENOENT;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!osf_attrs[OSF_ATTR_FINGER])
+		return -EINVAL;
+
+	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+	list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
+		if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
+			continue;
+
+		/*
+		 * We are protected by nfnl mutex.
+		 */
+		list_del_rcu(&sf->finger_entry);
+		kfree_rcu(sf, rcu_head);
+
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
+	[OSF_MSG_ADD]	= {
+		.call		= nfnl_osf_add_callback,
+		.attr_count	= OSF_ATTR_MAX,
+		.policy		= nfnl_osf_policy,
+	},
+	[OSF_MSG_REMOVE]	= {
+		.call		= nfnl_osf_remove_callback,
+		.attr_count	= OSF_ATTR_MAX,
+		.policy		= nfnl_osf_policy,
+	},
+};
+
+static const struct nfnetlink_subsystem nfnl_osf_subsys = {
+	.name			= "osf",
+	.subsys_id		= NFNL_SUBSYS_OSF,
+	.cb_count		= OSF_MSG_MAX,
+	.cb			= nfnl_osf_callbacks,
+};
+
+static int __init nfnl_osf_init(void)
+{
+	int err = -EINVAL;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i)
+		INIT_LIST_HEAD(&nf_osf_fingers[i]);
+
+	err = nfnetlink_subsys_register(&nfnl_osf_subsys);
+	if (err < 0) {
+		pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
+		goto err_out_exit;
+	}
+	return 0;
+
+err_out_exit:
+	return err;
+}
+
+static void __exit nfnl_osf_fini(void)
+{
+	struct nf_osf_finger *f;
+	int i;
+
+	nfnetlink_subsys_unregister(&nfnl_osf_subsys);
+
+	rcu_read_lock();
+	for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) {
+		list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) {
+			list_del_rcu(&f->finger_entry);
+			kfree_rcu(f, rcu_head);
+		}
+	}
+	rcu_read_unlock();
+
+	rcu_barrier();
+}
+
+module_init(nfnl_osf_init);
+module_exit(nfnl_osf_fini);
+
 MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 9cfef73b4107..bf7bba80e24c 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,118 +37,6 @@
 #include <net/netfilter/nf_log.h>
 #include <linux/netfilter/xt_osf.h>
 
-/*
- * Indexed by dont-fragment bit.
- * It is the only constant value in the fingerprint.
- */
-static struct list_head xt_osf_fingers[2];
-
-static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
-	[OSF_ATTR_FINGER]	= { .len = sizeof(struct xt_osf_user_finger) },
-};
-
-static int xt_osf_add_callback(struct net *net, struct sock *ctnl,
-			       struct sk_buff *skb, const struct nlmsghdr *nlh,
-			       const struct nlattr * const osf_attrs[],
-			       struct netlink_ext_ack *extack)
-{
-	struct xt_osf_user_finger *f;
-	struct xt_osf_finger *kf = NULL, *sf;
-	int err = 0;
-
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (!osf_attrs[OSF_ATTR_FINGER])
-		return -EINVAL;
-
-	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
-		return -EINVAL;
-
-	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
-
-	kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL);
-	if (!kf)
-		return -ENOMEM;
-
-	memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger));
-
-	list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
-		if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
-			continue;
-
-		kfree(kf);
-		kf = NULL;
-
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
-			err = -EEXIST;
-		break;
-	}
-
-	/*
-	 * We are protected by nfnl mutex.
-	 */
-	if (kf)
-		list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]);
-
-	return err;
-}
-
-static int xt_osf_remove_callback(struct net *net, struct sock *ctnl,
-				  struct sk_buff *skb,
-				  const struct nlmsghdr *nlh,
-				  const struct nlattr * const osf_attrs[],
-				  struct netlink_ext_ack *extack)
-{
-	struct xt_osf_user_finger *f;
-	struct xt_osf_finger *sf;
-	int err = -ENOENT;
-
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (!osf_attrs[OSF_ATTR_FINGER])
-		return -EINVAL;
-
-	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
-
-	list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
-		if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
-			continue;
-
-		/*
-		 * We are protected by nfnl mutex.
-		 */
-		list_del_rcu(&sf->finger_entry);
-		kfree_rcu(sf, rcu_head);
-
-		err = 0;
-		break;
-	}
-
-	return err;
-}
-
-static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = {
-	[OSF_MSG_ADD]	= {
-		.call		= xt_osf_add_callback,
-		.attr_count	= OSF_ATTR_MAX,
-		.policy		= xt_osf_policy,
-	},
-	[OSF_MSG_REMOVE]	= {
-		.call		= xt_osf_remove_callback,
-		.attr_count	= OSF_ATTR_MAX,
-		.policy		= xt_osf_policy,
-	},
-};
-
-static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
-	.name			= "osf",
-	.subsys_id		= NFNL_SUBSYS_OSF,
-	.cb_count		= OSF_MSG_MAX,
-	.cb			= xt_osf_nfnetlink_callbacks,
-};
-
 static bool
 xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 {
@@ -159,7 +47,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 		return false;
 
 	return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
-			    xt_out(p), info, net, xt_osf_fingers);
+			    xt_out(p), info, net, nf_osf_fingers);
 }
 
 static struct xt_match xt_osf_match = {
@@ -177,52 +65,21 @@ static struct xt_match xt_osf_match = {
 
 static int __init xt_osf_init(void)
 {
-	int err = -EINVAL;
-	int i;
-
-	for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i)
-		INIT_LIST_HEAD(&xt_osf_fingers[i]);
-
-	err = nfnetlink_subsys_register(&xt_osf_nfnetlink);
-	if (err < 0) {
-		pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
-		goto err_out_exit;
-	}
+	int err;
 
 	err = xt_register_match(&xt_osf_match);
 	if (err) {
 		pr_err("Failed to register OS fingerprint "
 		       "matching module (%d)\n", err);
-		goto err_out_remove;
+		return err;
 	}
 
 	return 0;
-
-err_out_remove:
-	nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
-err_out_exit:
-	return err;
 }
 
 static void __exit xt_osf_fini(void)
 {
-	struct xt_osf_finger *f;
-	int i;
-
-	nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
 	xt_unregister_match(&xt_osf_match);
-
-	rcu_read_lock();
-	for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) {
-
-		list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
-			list_del_rcu(&f->finger_entry);
-			kfree_rcu(f, rcu_head);
-		}
-	}
-	rcu_read_unlock();
-
-	rcu_barrier();
 }
 
 module_init(xt_osf_init);
-- 
cgit v1.2.3


From b96af92d6eaf9fadd77aa798c508a8a9d2e60020 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Wed, 25 Jul 2018 01:32:46 +0200
Subject: netfilter: nf_tables: implement Passive OS fingerprint module in
 nft_osf

Add basic module functions into nft_osf.[ch] in order to implement OSF
module in nf_tables.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   7 ++
 net/netfilter/Kconfig                    |   7 ++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_osf.c                  | 106 +++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 net/netfilter/nft_osf.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index f466860bcf75..382c32d630e9 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1463,6 +1463,13 @@ enum nft_flowtable_hook_attributes {
 };
 #define NFTA_FLOWTABLE_HOOK_MAX	(__NFTA_FLOWTABLE_HOOK_MAX - 1)
 
+enum nft_osf_attributes {
+	NFTA_OSF_UNSPEC,
+	NFTA_OSF_DREG,
+	__NFTA_OSF_MAX,
+};
+#define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1)
+
 /**
  * enum nft_device_attributes - nf_tables device netlink attributes
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 85333431e524..16fdfb75efb5 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -627,6 +627,13 @@ config NFT_SOCKET
 	  This option allows matching for the presence or absence of a
 	  corresponding socket and its attributes.
 
+config NFT_OSF
+	tristate "Netfilter nf_tables passive OS fingerprint support"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_NETLINK_OSF
+	help
+	  This option allows matching packets from an specific OS.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e684f9b8a9c3..5cbbf6978b55 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_FIB)		+= nft_fib.o
 obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
+obj-$(CONFIG_NFT_OSF)		+= nft_osf.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
new file mode 100644
index 000000000000..bdacc4cffba4
--- /dev/null
+++ b/net/netfilter/nft_osf.c
@@ -0,0 +1,106 @@
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_osf.h>
+
+#define OSF_GENRE_SIZE		32
+
+struct nft_osf {
+	enum nft_registers	dreg:8;
+};
+
+static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
+	[NFTA_OSF_DREG]		= { .type = NLA_U32 },
+};
+
+static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
+			 const struct nft_pktinfo *pkt)
+{
+	struct nft_osf *priv = nft_expr_priv(expr);
+	u32 *dest = &regs->data[priv->dreg];
+	struct sk_buff *skb = pkt->skb;
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+	const char *os_name;
+
+	tcp = skb_header_pointer(skb, ip_hdrlen(skb),
+				 sizeof(struct tcphdr), &_tcph);
+	if (!tcp) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+	if (!tcp->syn) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	os_name = nf_osf_find(skb, nf_osf_fingers);
+	if (!os_name)
+		strncpy((char *)dest, "unknown", IFNAMSIZ);
+	else
+		strncpy((char *)dest, os_name, IFNAMSIZ);
+}
+
+static int nft_osf_init(const struct nft_ctx *ctx,
+			const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_osf *priv = nft_expr_priv(expr);
+	int err;
+
+	priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
+	err = nft_validate_register_store(ctx, priv->dreg, NULL,
+					  NFTA_DATA_VALUE, OSF_GENRE_SIZE);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_osf *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_osf_type;
+static const struct nft_expr_ops nft_osf_op = {
+	.eval		= nft_osf_eval,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_osf)),
+	.init		= nft_osf_init,
+	.dump		= nft_osf_dump,
+	.type		= &nft_osf_type,
+};
+
+static struct nft_expr_type nft_osf_type __read_mostly = {
+	.ops		= &nft_osf_op,
+	.name		= "osf",
+	.owner		= THIS_MODULE,
+	.policy		= nft_osf_policy,
+	.maxattr	= NFTA_OSF_MAX,
+};
+
+static int __init nft_osf_module_init(void)
+{
+	return nft_register_expr(&nft_osf_type);
+}
+
+static void __exit nft_osf_module_exit(void)
+{
+	return nft_unregister_expr(&nft_osf_type);
+}
+
+module_init(nft_osf_module_init);
+module_exit(nft_osf_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
+MODULE_ALIAS_NFT_EXPR("osf");
-- 
cgit v1.2.3


From 4ed8eb6570a49931c705512060acd50058d61616 Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Mon, 30 Jul 2018 11:07:32 +0200
Subject: netfilter: nf_tables: Add native tproxy support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A great portion of the code is taken from xt_TPROXY.c

There are some changes compared to the iptables implementation:
 - tproxy statement is not terminal here
 - The address and the port are each optional, but at least one of them
   has to be specified. If one of them is not specified, the evaluation
   will be performed with the original attribute of the packet (i.e. if
   the target port is not specified, the packet's dport will be used).

To make this work in inet tables, the tproxy structure has a family
member (typically called priv->family) which is not necessarily equal to
ctx->family.

priv->family can have three values legally:
 - NFPROTO_IPV4 if the table family is ip OR if table family is inet,
   but an ipv4 address is specified as a target address. The rule only
   evaluates ipv4 packets in this case.
 - NFPROTO_IPV6 if the table family is ip6 OR if table family is inet,
   but an ipv6 address is specified as a target address. The rule only
   evaluates ipv6 packets in this case.
 - NFPROTO_UNSPEC if the table family is inet AND if only the port is
   specified. The rule will evaluate both ipv4 and ipv6 packets.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  16 ++
 net/netfilter/Kconfig                    |  10 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_tproxy.c               | 314 +++++++++++++++++++++++++++++++
 4 files changed, 341 insertions(+)
 create mode 100644 net/netfilter/nft_tproxy.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 382c32d630e9..f112ea52dc1a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1252,6 +1252,22 @@ enum nft_nat_attributes {
 };
 #define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
 
+/**
+ * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes
+ *
+ * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers)
+ * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers)
+ * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers)
+ */
+enum nft_tproxy_attributes {
+	NFTA_TPROXY_UNSPEC,
+	NFTA_TPROXY_FAMILY,
+	NFTA_TPROXY_REG_ADDR,
+	NFTA_TPROXY_REG_PORT,
+	__NFTA_TPROXY_MAX
+};
+#define NFTA_TPROXY_MAX		(__NFTA_TPROXY_MAX - 1)
+
 /**
  * enum nft_masq_attributes - nf_tables masquerade expression attributes
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 16fdfb75efb5..0febf3e21f91 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -634,6 +634,16 @@ config NFT_OSF
 	help
 	  This option allows matching packets from an specific OS.
 
+config NFT_TPROXY
+	tristate "Netfilter nf_tables tproxy support"
+	depends on IPV6 || IPV6=n
+	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if NF_TABLES_IPV6
+	select NF_TPROXY_IPV4
+	select NF_TPROXY_IPV6 if NF_TABLES_IPV6
+	help
+	  This makes transparent proxy support available in nftables.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5cbbf6978b55..cf61615cc529 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
 obj-$(CONFIG_NFT_OSF)		+= nft_osf.o
+obj-$(CONFIG_NFT_TPROXY)	+= nft_tproxy.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
new file mode 100644
index 000000000000..c6845f7baa08
--- /dev/null
+++ b/net/netfilter/nft_tproxy.c
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tproxy.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+#include <linux/if_ether.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+struct nft_tproxy {
+	enum nft_registers      sreg_addr:8;
+	enum nft_registers      sreg_port:8;
+	u8			family;
+};
+
+static void nft_tproxy_eval_v4(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	const struct iphdr *iph = ip_hdr(skb);
+	struct udphdr _hdr, *hp;
+	__be32 taddr = 0;
+	__be16 tport = 0;
+	struct sock *sk;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (!hp) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	/* check if there's an ongoing connection on the packet addresses, this
+	 * happens if the redirect already happened and the current packet
+	 * belongs to an already established connection
+	 */
+	sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
+
+	if (priv->sreg_addr)
+		taddr = regs->data[priv->sreg_addr];
+	taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
+
+	if (priv->sreg_port)
+		tport = regs->data[priv->sreg_port];
+	if (!tport)
+		tport = hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT) {
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
+	} else if (!sk) {
+		/* no, there's no established connection, check if
+		 * there's a listener on the redirected addr/port
+		 */
+		sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
+					   iph->saddr, taddr,
+					   hp->source, tport,
+					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+	}
+
+	if (sk && nf_tproxy_sk_is_transparent(sk))
+		nf_tproxy_assign_sock(skb, sk);
+	else
+		regs->verdict.code = NFT_BREAK;
+}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+static void nft_tproxy_eval_v6(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct in6_addr taddr = {0};
+	int thoff = pkt->xt.thoff;
+	struct udphdr _hdr, *hp;
+	__be16 tport = 0;
+	struct sock *sk;
+	int l4proto;
+
+	if (!pkt->tprot_set) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+	l4proto = pkt->tprot;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	/* check if there's an ongoing connection on the packet addresses, this
+	 * happens if the redirect already happened and the current packet
+	 * belongs to an already established connection
+	 */
+	sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED);
+
+	if (priv->sreg_addr)
+		memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr));
+	taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
+
+	if (priv->sreg_port)
+		tport = regs->data[priv->sreg_port];
+	if (!tport)
+		tport = hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT) {
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff,
+						 nft_net(pkt),
+						 &taddr,
+						 tport,
+						 sk);
+	} else if (!sk) {
+		/* no there's no established connection, check if
+		 * there's a listener on the redirected addr/port
+		 */
+		sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff,
+					   l4proto, &iph->saddr, &taddr,
+					   hp->source, tport,
+					   nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER);
+	}
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_sk_is_transparent(sk))
+		nf_tproxy_assign_sock(skb, sk);
+	else
+		regs->verdict.code = NFT_BREAK;
+}
+#endif
+
+static void nft_tproxy_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		switch (priv->family) {
+		case NFPROTO_IPV4:
+		case NFPROTO_UNSPEC:
+			nft_tproxy_eval_v4(expr, regs, pkt);
+			return;
+		}
+		break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case NFPROTO_IPV6:
+		switch (priv->family) {
+		case NFPROTO_IPV6:
+		case NFPROTO_UNSPEC:
+			nft_tproxy_eval_v6(expr, regs, pkt);
+			return;
+		}
+#endif
+	}
+	regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
+	[NFTA_TPROXY_FAMILY]   = { .type = NLA_U32 },
+	[NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
+	[NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
+};
+
+static int nft_tproxy_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_tproxy *priv = nft_expr_priv(expr);
+	unsigned int alen = 0;
+	int err;
+
+	if (!tb[NFTA_TPROXY_FAMILY] ||
+	    (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT]))
+		return -EINVAL;
+
+	priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
+
+	switch (ctx->family) {
+	case NFPROTO_IPV4:
+		if (priv->family != NFPROTO_IPV4)
+			return -EINVAL;
+		break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case NFPROTO_IPV6:
+		if (priv->family != NFPROTO_IPV6)
+			return -EINVAL;
+		break;
+#endif
+	case NFPROTO_INET:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* Address is specified but the rule family is not set accordingly */
+	if (priv->family == NFPROTO_UNSPEC && tb[NFTA_TPROXY_REG_ADDR])
+		return -EINVAL;
+
+	switch (priv->family) {
+	case NFPROTO_IPV4:
+		alen = FIELD_SIZEOF(union nf_inet_addr, in);
+		err = nf_defrag_ipv4_enable(ctx->net);
+		if (err)
+			return err;
+		break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+	case NFPROTO_IPV6:
+		alen = FIELD_SIZEOF(union nf_inet_addr, in6);
+		err = nf_defrag_ipv6_enable(ctx->net);
+		if (err)
+			return err;
+		break;
+#endif
+	case NFPROTO_UNSPEC:
+		/* No address is specified here */
+		err = nf_defrag_ipv4_enable(ctx->net);
+		if (err)
+			return err;
+		err = nf_defrag_ipv6_enable(ctx->net);
+		if (err)
+			return err;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (tb[NFTA_TPROXY_REG_ADDR]) {
+		priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
+		err = nft_validate_register_load(priv->sreg_addr, alen);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_TPROXY_REG_PORT]) {
+		priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
+		err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int nft_tproxy_dump(struct sk_buff *skb,
+			   const struct nft_expr *expr)
+{
+	const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family)))
+		return -1;
+
+	if (priv->sreg_addr &&
+	    nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr))
+		return -1;
+
+	if (priv->sreg_port &&
+	    nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port))
+			return -1;
+
+	return 0;
+}
+
+static struct nft_expr_type nft_tproxy_type;
+static const struct nft_expr_ops nft_tproxy_ops = {
+	.type		= &nft_tproxy_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
+	.eval		= nft_tproxy_eval,
+	.init		= nft_tproxy_init,
+	.dump		= nft_tproxy_dump,
+};
+
+static struct nft_expr_type nft_tproxy_type __read_mostly = {
+	.name		= "tproxy",
+	.ops		= &nft_tproxy_ops,
+	.policy		= nft_tproxy_policy,
+	.maxattr	= NFTA_TPROXY_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_tproxy_module_init(void)
+{
+	return nft_register_expr(&nft_tproxy_type);
+}
+
+static void __exit nft_tproxy_module_exit(void)
+{
+	nft_unregister_expr(&nft_tproxy_type);
+}
+
+module_init(nft_tproxy_module_init);
+module_exit(nft_tproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables tproxy support module");
+MODULE_ALIAS_NFT_EXPR("tproxy");
-- 
cgit v1.2.3


From ea8532daee31bc72abfbc9ca7a43cbec0f6c05af Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 30 May 2018 11:07:05 -0400
Subject: media: videodev2: get rid of VIDIOC_RESERVED

While this ioctl is there at least since Kernel 2.6.12-rc2, it
was never used by any upstream driver.

Get rid of it.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/videodev2.h.rst.exceptions | 1 -
 include/uapi/linux/videodev2.h                 | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/videodev2.h.rst.exceptions b/Documentation/media/videodev2.h.rst.exceptions
index a5cb0a8686ac..ca9f0edc579e 100644
--- a/Documentation/media/videodev2.h.rst.exceptions
+++ b/Documentation/media/videodev2.h.rst.exceptions
@@ -517,7 +517,6 @@ ignore define V4L2_CTRL_WHICH_DEF_VAL
 ignore define V4L2_OUT_CAP_CUSTOM_TIMINGS
 ignore define V4L2_CID_MAX_CTRLS
 
-ignore ioctl VIDIOC_RESERVED
 ignore define BASE_VIDIOC_PRIVATE
 
 # Associate ioctls with their counterparts
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 3ea8097c2470..d8b33095abe0 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -2311,7 +2311,6 @@ struct v4l2_create_buffers {
  *
  */
 #define VIDIOC_QUERYCAP		 _IOR('V',  0, struct v4l2_capability)
-#define VIDIOC_RESERVED		  _IO('V',  1)
 #define VIDIOC_ENUM_FMT         _IOWR('V',  2, struct v4l2_fmtdesc)
 #define VIDIOC_G_FMT		_IOWR('V',  4, struct v4l2_format)
 #define VIDIOC_S_FMT		_IOWR('V',  5, struct v4l2_format)
-- 
cgit v1.2.3


From 802bfb19152c0fb4137c6ba72bcf042ee023e743 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 30 Jul 2018 14:30:42 +0200
Subject: net/sched: user-space can't set unknown tcfa_action values

Currently, when initializing an action, user-space can specify
and use arbitrary values for the tcfa_action field. If the value
is unknown to the kernel, it is implicitly treated as TC_ACT_UNSPEC.

This change explicitly checks for unknown values at action creation
time, and explicitly converts them to TC_ACT_UNSPEC. No functional
changes are introduced, but this will allow introducing tcfa_action
values not exposed to user-space in a later patch.

Note: we can't use the above to hide TC_ACT_REDIRECT from user-space,
as the latter is already part of uAPI.

v3 -> v4:
 - use a helper to check for action validity (JiriP)
 - emit an extack for invalid actions (JiriP)
v4 -> v5:
 - keep messages on a single line, drop net_warn (Marcelo)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  6 ++++--
 net/sched/act_api.c          | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index b4512254036b..48e5b5d49a34 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -45,6 +45,7 @@ enum {
 				   * the skb and act like everything
 				   * is alright.
 				   */
+#define TC_ACT_VALUE_MAX	TC_ACT_TRAP
 
 /* There is a special kind of actions called "extended actions",
  * which need a value parameter. These have a local opcode located in
@@ -55,11 +56,12 @@ enum {
 #define __TC_ACT_EXT_SHIFT 28
 #define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT)
 #define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1)
-#define TC_ACT_EXT_CMP(combined, opcode) \
-	(((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode)
+#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK))
+#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode)
 
 #define TC_ACT_JUMP __TC_ACT_EXT(1)
 #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
+#define TC_ACT_EXT_OPCODE_MAX	TC_ACT_GOTO_CHAIN
 
 /* Action type identifiers*/
 enum {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index b43df1e25c6d..229d63c99be2 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -786,6 +786,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 	return c;
 }
 
+static bool tcf_action_valid(int action)
+{
+	int opcode = TC_ACT_EXT_OPCODE(action);
+
+	if (!opcode)
+		return action <= TC_ACT_VALUE_MAX;
+	return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
+}
+
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
 				    char *name, int ovr, int bind,
@@ -895,6 +904,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 		}
 	}
 
+	if (!tcf_action_valid(a->tcfa_action)) {
+		NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead");
+		a->tcfa_action = TC_ACT_UNSPEC;
+	}
+
 	return a;
 
 err_mod:
-- 
cgit v1.2.3


From b41e44b4cb230747d7ad56e38c9dc65369b1d381 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 30 May 2018 11:07:03 -0400
Subject: media: dvb/video.h: get rid of unused APIs

There are a number of other ioctls that aren't used anywhere
inside the Kernel tree.

Get rid of them.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 .../media/uapi/dvb/video-get-frame-rate.rst        |  61 -----------
 Documentation/media/uapi/dvb/video-get-navi.rst    |  84 ---------------
 .../media/uapi/dvb/video-set-attributes.rst        |  93 -----------------
 .../media/uapi/dvb/video-set-highlight.rst         |  86 ----------------
 Documentation/media/uapi/dvb/video-set-id.rst      |  75 --------------
 Documentation/media/uapi/dvb/video-set-spu.rst     |  85 ----------------
 Documentation/media/uapi/dvb/video-set-system.rst  |  77 --------------
 .../media/uapi/dvb/video_function_calls.rst        |   6 --
 Documentation/media/uapi/dvb/video_types.rst       | 113 ---------------------
 Documentation/media/video.h.rst.exceptions         |   2 -
 fs/compat_ioctl.c                                  |   7 --
 include/uapi/linux/dvb/video.h                     |  51 ----------
 12 files changed, 740 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/video-get-frame-rate.rst
 delete mode 100644 Documentation/media/uapi/dvb/video-get-navi.rst
 delete mode 100644 Documentation/media/uapi/dvb/video-set-attributes.rst
 delete mode 100644 Documentation/media/uapi/dvb/video-set-highlight.rst
 delete mode 100644 Documentation/media/uapi/dvb/video-set-id.rst
 delete mode 100644 Documentation/media/uapi/dvb/video-set-spu.rst
 delete mode 100644 Documentation/media/uapi/dvb/video-set-system.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/dvb/video-get-frame-rate.rst b/Documentation/media/uapi/dvb/video-get-frame-rate.rst
deleted file mode 100644
index 400042a854cf..000000000000
--- a/Documentation/media/uapi/dvb/video-get-frame-rate.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_GET_FRAME_RATE:
-
-====================
-VIDEO_GET_FRAME_RATE
-====================
-
-Name
-----
-
-VIDEO_GET_FRAME_RATE
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(int fd, VIDEO_GET_FRAME_RATE, unsigned int *rate)
-    :name: VIDEO_GET_FRAME_RATE
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_GET_FRAME_RATE for this command.
-
-    -  .. row 3
-
-       -  unsigned int \*rate
-
-       -  Returns the framerate in number of frames per 1000 seconds.
-
-
-Description
------------
-
-This ioctl call asks the Video Device to return the current framerate.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
diff --git a/Documentation/media/uapi/dvb/video-get-navi.rst b/Documentation/media/uapi/dvb/video-get-navi.rst
deleted file mode 100644
index 114a9ac48b9e..000000000000
--- a/Documentation/media/uapi/dvb/video-get-navi.rst
+++ /dev/null
@@ -1,84 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_GET_NAVI:
-
-==============
-VIDEO_GET_NAVI
-==============
-
-Name
-----
-
-VIDEO_GET_NAVI
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, VIDEO_GET_NAVI , struct video_navi_pack *navipack)
-    :name: VIDEO_GET_NAVI
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_GET_NAVI for this command.
-
-    -  .. row 3
-
-       -  video_navi_pack_t \*navipack
-
-       -  PCI or DSI pack (private stream 2) according to section ??.
-
-
-Description
------------
-
-This ioctl returns navigational information from the DVD stream. This is
-especially needed if an encoded stream has to be decoded by the
-hardware.
-
-.. c:type:: video_navi_pack
-
-.. code-block::c
-
-	typedef struct video_navi_pack {
-		int length;          /* 0 ... 1024 */
-		__u8 data[1024];
-	} video_navi_pack_t;
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EFAULT``
-
-       -  driver is not able to return navigational information
diff --git a/Documentation/media/uapi/dvb/video-set-attributes.rst b/Documentation/media/uapi/dvb/video-set-attributes.rst
deleted file mode 100644
index b2f11a6746e9..000000000000
--- a/Documentation/media/uapi/dvb/video-set-attributes.rst
+++ /dev/null
@@ -1,93 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_SET_ATTRIBUTES:
-
-====================
-VIDEO_SET_ATTRIBUTES
-====================
-
-Name
-----
-
-VIDEO_SET_ATTRIBUTES
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, VIDEO_SET_ATTRIBUTE ,video_attributes_t vattr)
-    :name: VIDEO_SET_ATTRIBUTE
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_SET_ATTRIBUTE for this command.
-
-    -  .. row 3
-
-       -  video_attributes_t vattr
-
-       -  video attributes according to section ??.
-
-
-Description
------------
-
-This ioctl is intended for DVD playback and allows you to set certain
-information about the stream. Some hardware may not need this
-information, but the call also tells the hardware to prepare for DVD
-playback.
-
-.. c:type:: video_attributes_t
-
-.. code-block::c
-
-	typedef __u16 video_attributes_t;
-	/*   bits: descr. */
-	/*   15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */
-	/*   13-12 TV system (0=525/60, 1=625/50) */
-	/*   11-10 Aspect ratio (0=4:3, 3=16:9) */
-	/*    9- 8 permitted display mode on 4:3 monitor (0=both, 1=only pan-sca */
-	/*    7    line 21-1 data present in GOP (1=yes, 0=no) */
-	/*    6    line 21-2 data present in GOP (1=yes, 0=no) */
-	/*    5- 3 source resolution (0=720x480/576, 1=704x480/576, 2=352x480/57 */
-	/*    2    source letterboxed (1=yes, 0=no) */
-	/*    0    film/camera mode (0=camera, 1=film (625/50 only)) */
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  input is not a valid attribute setting.
diff --git a/Documentation/media/uapi/dvb/video-set-highlight.rst b/Documentation/media/uapi/dvb/video-set-highlight.rst
deleted file mode 100644
index 90aeafd923b7..000000000000
--- a/Documentation/media/uapi/dvb/video-set-highlight.rst
+++ /dev/null
@@ -1,86 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_SET_HIGHLIGHT:
-
-===================
-VIDEO_SET_HIGHLIGHT
-===================
-
-Name
-----
-
-VIDEO_SET_HIGHLIGHT
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, VIDEO_SET_HIGHLIGHT, struct video_highlight *vhilite)
-    :name: VIDEO_SET_HIGHLIGHT
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_SET_HIGHLIGHT for this command.
-
-    -  .. row 3
-
-       -  video_highlight_t \*vhilite
-
-       -  SPU Highlight information according to section ??.
-
-
-Description
------------
-
-This ioctl sets the SPU highlight information for the menu access of a
-DVD.
-
-.. c:type:: video_highlight
-
-.. code-block:: c
-
-	typedef
-	struct video_highlight {
-		int     active;      /*    1=show highlight, 0=hide highlight */
-		__u8    contrast1;   /*    7- 4  Pattern pixel contrast */
-				/*    3- 0  Background pixel contrast */
-		__u8    contrast2;   /*    7- 4  Emphasis pixel-2 contrast */
-				/*    3- 0  Emphasis pixel-1 contrast */
-		__u8    color1;      /*    7- 4  Pattern pixel color */
-				/*    3- 0  Background pixel color */
-		__u8    color2;      /*    7- 4  Emphasis pixel-2 color */
-				/*    3- 0  Emphasis pixel-1 color */
-		__u32    ypos;       /*   23-22  auto action mode */
-				/*   21-12  start y */
-				/*    9- 0  end y */
-		__u32    xpos;       /*   23-22  button color number */
-				/*   21-12  start x */
-				/*    9- 0  end x */
-	} video_highlight_t;
-
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
diff --git a/Documentation/media/uapi/dvb/video-set-id.rst b/Documentation/media/uapi/dvb/video-set-id.rst
deleted file mode 100644
index 18f66875ae3f..000000000000
--- a/Documentation/media/uapi/dvb/video-set-id.rst
+++ /dev/null
@@ -1,75 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_SET_ID:
-
-============
-VIDEO_SET_ID
-============
-
-Name
-----
-
-VIDEO_SET_ID
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(int fd, VIDEO_SET_ID, int id)
-    :name: VIDEO_SET_ID
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_SET_ID for this command.
-
-    -  .. row 3
-
-       -  int id
-
-       -  video sub-stream id
-
-
-Description
------------
-
-This ioctl selects which sub-stream is to be decoded if a program or
-system stream is sent to the video device.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  Invalid sub-stream id.
diff --git a/Documentation/media/uapi/dvb/video-set-spu.rst b/Documentation/media/uapi/dvb/video-set-spu.rst
deleted file mode 100644
index 739e5e7bd133..000000000000
--- a/Documentation/media/uapi/dvb/video-set-spu.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_SET_SPU:
-
-=============
-VIDEO_SET_SPU
-=============
-
-Name
-----
-
-VIDEO_SET_SPU
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, VIDEO_SET_SPU , struct video_spu *spu)
-    :name: VIDEO_SET_SPU
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_SET_SPU for this command.
-
-    -  .. row 3
-
-       -  video_spu_t \*spu
-
-       -  SPU decoding (de)activation and subid setting according to section
-	  ??.
-
-
-Description
------------
-
-This ioctl activates or deactivates SPU decoding in a DVD input stream.
-It can only be used, if the driver is able to handle a DVD stream.
-
-.. c:type:: struct video_spu
-
-.. code-block:: c
-
-	typedef struct video_spu {
-		int active;
-		int stream_id;
-	} video_spu_t;
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  input is not a valid spu setting or driver cannot handle SPU.
diff --git a/Documentation/media/uapi/dvb/video-set-system.rst b/Documentation/media/uapi/dvb/video-set-system.rst
deleted file mode 100644
index e39cbe080ef7..000000000000
--- a/Documentation/media/uapi/dvb/video-set-system.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _VIDEO_SET_SYSTEM:
-
-================
-VIDEO_SET_SYSTEM
-================
-
-Name
-----
-
-VIDEO_SET_SYSTEM
-
-.. attention:: This ioctl is deprecated.
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, VIDEO_SET_SYSTEM , video_system_t system)
-    :name: VIDEO_SET_SYSTEM
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -  .. row 2
-
-       -  int request
-
-       -  Equals VIDEO_SET_FORMAT for this command.
-
-    -  .. row 3
-
-       -  video_system_t system
-
-       -  video system of TV output.
-
-
-Description
------------
-
-This ioctl sets the television output format. The format (see section
-??) may vary from the color format of the displayed MPEG stream. If the
-hardware is not able to display the requested format the call will
-return an error.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  system is not a valid or supported video system.
diff --git a/Documentation/media/uapi/dvb/video_function_calls.rst b/Documentation/media/uapi/dvb/video_function_calls.rst
index 8d8383ffaeba..3f4f6c9ffad7 100644
--- a/Documentation/media/uapi/dvb/video_function_calls.rst
+++ b/Documentation/media/uapi/dvb/video_function_calls.rst
@@ -21,7 +21,6 @@ Video Function Calls
     video-get-status
     video-get-frame-count
     video-get-pts
-    video-get-frame-rate
     video-get-event
     video-command
     video-try-command
@@ -31,12 +30,7 @@ Video Function Calls
     video-fast-forward
     video-slowmotion
     video-get-capabilities
-    video-set-id
     video-clear-buffer
     video-set-streamtype
     video-set-format
-    video-set-system
-    video-set-highlight
-    video-set-spu
-    video-get-navi
     video-set-attributes
diff --git a/Documentation/media/uapi/dvb/video_types.rst b/Documentation/media/uapi/dvb/video_types.rst
index 4cfa00e5c934..a0942171596c 100644
--- a/Documentation/media/uapi/dvb/video_types.rst
+++ b/Documentation/media/uapi/dvb/video_types.rst
@@ -246,116 +246,3 @@ following bits set according to the hardwares capabilities.
      #define VIDEO_CAP_SPU    16
      #define VIDEO_CAP_NAVI   32
      #define VIDEO_CAP_CSS    64
-
-
-.. _video-system:
-
-video_system_t
-==============
-
-A call to VIDEO_SET_SYSTEM sets the desired video system for TV
-output. The following system types can be set:
-
-
-.. code-block:: c
-
-    typedef enum {
-	 VIDEO_SYSTEM_PAL,
-	 VIDEO_SYSTEM_NTSC,
-	 VIDEO_SYSTEM_PALN,
-	 VIDEO_SYSTEM_PALNc,
-	 VIDEO_SYSTEM_PALM,
-	 VIDEO_SYSTEM_NTSC60,
-	 VIDEO_SYSTEM_PAL60,
-	 VIDEO_SYSTEM_PALM60
-    } video_system_t;
-
-
-.. c:type:: video_highlight
-
-struct video_highlight
-======================
-
-Calling the ioctl VIDEO_SET_HIGHLIGHTS posts the SPU highlight
-information. The call expects the following format for that information:
-
-
-.. code-block:: c
-
-     typedef
-     struct video_highlight {
-	 boolean active;      /*    1=show highlight, 0=hide highlight */
-	 uint8_t contrast1;   /*    7- 4  Pattern pixel contrast */
-		      /*    3- 0  Background pixel contrast */
-	 uint8_t contrast2;   /*    7- 4  Emphasis pixel-2 contrast */
-		      /*    3- 0  Emphasis pixel-1 contrast */
-	 uint8_t color1;      /*    7- 4  Pattern pixel color */
-		      /*    3- 0  Background pixel color */
-	 uint8_t color2;      /*    7- 4  Emphasis pixel-2 color */
-		      /*    3- 0  Emphasis pixel-1 color */
-	 uint32_t ypos;       /*   23-22  auto action mode */
-		      /*   21-12  start y */
-		      /*    9- 0  end y */
-	 uint32_t xpos;       /*   23-22  button color number */
-		      /*   21-12  start x */
-		      /*    9- 0  end x */
-     } video_highlight_t;
-
-
-.. c:type:: video_spu
-
-struct video_spu
-================
-
-Calling VIDEO_SET_SPU deactivates or activates SPU decoding, according
-to the following format:
-
-
-.. code-block:: c
-
-     typedef
-     struct video_spu {
-	 boolean active;
-	 int stream_id;
-     } video_spu_t;
-
-
-.. c:type:: video_navi_pack
-
-struct video_navi_pack
-======================
-
-In order to get the navigational data the following structure has to be
-passed to the ioctl VIDEO_GET_NAVI:
-
-
-.. code-block:: c
-
-     typedef
-     struct video_navi_pack {
-	 int length;         /* 0 ... 1024 */
-	 uint8_t data[1024];
-     } video_navi_pack_t;
-
-
-.. _video-attributes-t:
-
-video_attributes_t
-==================
-
-The following attributes can be set by a call to VIDEO_SET_ATTRIBUTES:
-
-
-.. code-block:: c
-
-     typedef uint16_t video_attributes_t;
-     /*   bits: descr. */
-     /*   15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */
-     /*   13-12 TV system (0=525/60, 1=625/50) */
-     /*   11-10 Aspect ratio (0=4:3, 3=16:9) */
-     /*    9- 8 permitted display mode on 4:3 monitor (0=both, 1=only pan-sca */
-     /*    7    line 21-1 data present in GOP (1=yes, 0=no) */
-     /*    6    line 21-2 data present in GOP (1=yes, 0=no) */
-     /*    5- 3 source resolution (0=720x480/576, 1=704x480/576, 2=352x480/57 */
-     /*    2    source letterboxed (1=yes, 0=no) */
-     /*    0    film/camera mode (0=camera, 1=film (625/50 only)) */
diff --git a/Documentation/media/video.h.rst.exceptions b/Documentation/media/video.h.rst.exceptions
index 89d7c3ef2da7..371cdbd7d062 100644
--- a/Documentation/media/video.h.rst.exceptions
+++ b/Documentation/media/video.h.rst.exceptions
@@ -34,6 +34,4 @@ replace typedef video_displayformat_t :c:type:`video_displayformat`
 replace typedef video_size_t :c:type:`video_size`
 replace typedef video_stream_source_t :c:type:`video_stream_source`
 replace typedef video_play_state_t :c:type:`video_play_state`
-replace typedef video_highlight_t :c:type:`video_highlight`
-replace typedef video_spu_t :c:type:`video_spu`
 replace typedef video_navi_pack_t :c:type:`video_navi_pack`
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index fdb5ef9b5d06..59216b172003 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1205,16 +1205,9 @@ COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
 COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
 COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
 COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(VIDEO_SET_ID)
 COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
 COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM)
-COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT)
-COMPATIBLE_IOCTL(VIDEO_SET_SPU)
-COMPATIBLE_IOCTL(VIDEO_GET_NAVI)
-COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
 COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
-COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
 /* cec */
 COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS)
 COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS)
diff --git a/include/uapi/linux/dvb/video.h b/include/uapi/linux/dvb/video.h
index 6a0c9757b7ba..43ba8b0a3d14 100644
--- a/include/uapi/linux/dvb/video.h
+++ b/include/uapi/linux/dvb/video.h
@@ -37,18 +37,6 @@ typedef enum {
 } video_format_t;
 
 
-typedef enum {
-	 VIDEO_SYSTEM_PAL,
-	 VIDEO_SYSTEM_NTSC,
-	 VIDEO_SYSTEM_PALN,
-	 VIDEO_SYSTEM_PALNc,
-	 VIDEO_SYSTEM_PALM,
-	 VIDEO_SYSTEM_NTSC60,
-	 VIDEO_SYSTEM_PAL60,
-	 VIDEO_SYSTEM_PALM60
-} video_system_t;
-
-
 typedef enum {
 	VIDEO_PAN_SCAN,       /* use pan and scan format */
 	VIDEO_LETTER_BOX,     /* use letterbox format */
@@ -160,38 +148,6 @@ struct video_still_picture {
 };
 
 
-typedef
-struct video_highlight {
-	int     active;      /*    1=show highlight, 0=hide highlight */
-	__u8    contrast1;   /*    7- 4  Pattern pixel contrast */
-			     /*    3- 0  Background pixel contrast */
-	__u8    contrast2;   /*    7- 4  Emphasis pixel-2 contrast */
-			     /*    3- 0  Emphasis pixel-1 contrast */
-	__u8    color1;      /*    7- 4  Pattern pixel color */
-			     /*    3- 0  Background pixel color */
-	__u8    color2;      /*    7- 4  Emphasis pixel-2 color */
-			     /*    3- 0  Emphasis pixel-1 color */
-	__u32    ypos;       /*   23-22  auto action mode */
-			     /*   21-12  start y */
-			     /*    9- 0  end y */
-	__u32    xpos;       /*   23-22  button color number */
-			     /*   21-12  start x */
-			     /*    9- 0  end x */
-} video_highlight_t;
-
-
-typedef struct video_spu {
-	int active;
-	int stream_id;
-} video_spu_t;
-
-
-typedef struct video_navi_pack {
-	int length;          /* 0 ... 1024 */
-	__u8 data[1024];
-} video_navi_pack_t;
-
-
 typedef __u16 video_attributes_t;
 /*   bits: descr. */
 /*   15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */
@@ -236,16 +192,9 @@ typedef __u16 video_attributes_t;
 #define VIDEO_SLOWMOTION           _IO('o', 32)
 #define VIDEO_GET_CAPABILITIES     _IOR('o', 33, unsigned int)
 #define VIDEO_CLEAR_BUFFER         _IO('o',  34)
-#define VIDEO_SET_ID               _IO('o', 35)
 #define VIDEO_SET_STREAMTYPE       _IO('o', 36)
 #define VIDEO_SET_FORMAT           _IO('o', 37)
-#define VIDEO_SET_SYSTEM           _IO('o', 38)
-#define VIDEO_SET_HIGHLIGHT        _IOW('o', 39, video_highlight_t)
-#define VIDEO_SET_SPU              _IOW('o', 50, video_spu_t)
-#define VIDEO_GET_NAVI             _IOR('o', 52, video_navi_pack_t)
-#define VIDEO_SET_ATTRIBUTES       _IO('o', 53)
 #define VIDEO_GET_SIZE             _IOR('o', 55, video_size_t)
-#define VIDEO_GET_FRAME_RATE       _IOR('o', 56, unsigned int)
 
 /**
  * VIDEO_GET_PTS
-- 
cgit v1.2.3


From d21c249b26311dd193b100e65fc9e7ae96233d40 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Wed, 30 May 2018 11:07:04 -0400
Subject: media: dvb/audio.h: get rid of unused APIs

There are a number of other ioctls that aren't used anywhere
inside the Kernel tree.

Get rid of them.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/audio.h.rst.exceptions         |  3 -
 Documentation/media/uapi/dvb/audio-get-pts.rst     | 65 ---------------------
 .../media/uapi/dvb/audio-set-attributes.rst        | 67 ----------------------
 Documentation/media/uapi/dvb/audio-set-ext-id.rst  | 66 ---------------------
 Documentation/media/uapi/dvb/audio-set-karaoke.rst | 66 ---------------------
 Documentation/media/uapi/dvb/audio_data_types.rst  | 37 ------------
 .../media/uapi/dvb/audio_function_calls.rst        |  4 --
 fs/compat_ioctl.c                                  |  3 -
 include/uapi/linux/dvb/audio.h                     | 37 ------------
 9 files changed, 348 deletions(-)
 delete mode 100644 Documentation/media/uapi/dvb/audio-get-pts.rst
 delete mode 100644 Documentation/media/uapi/dvb/audio-set-attributes.rst
 delete mode 100644 Documentation/media/uapi/dvb/audio-set-ext-id.rst
 delete mode 100644 Documentation/media/uapi/dvb/audio-set-karaoke.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/audio.h.rst.exceptions b/Documentation/media/audio.h.rst.exceptions
index f40f3cbfe4c9..940458774cf6 100644
--- a/Documentation/media/audio.h.rst.exceptions
+++ b/Documentation/media/audio.h.rst.exceptions
@@ -1,9 +1,6 @@
 # Ignore header name
 ignore define _DVBAUDIO_H_
 
-# Typedef pointing to structs
-replace typedef audio_karaoke_t :c:type:`audio_karaoke`
-
 # Undocumented audio caps, as this is a deprecated API anyway
 ignore define AUDIO_CAP_DTS
 ignore define AUDIO_CAP_LPCM
diff --git a/Documentation/media/uapi/dvb/audio-get-pts.rst b/Documentation/media/uapi/dvb/audio-get-pts.rst
deleted file mode 100644
index 2d1396b003de..000000000000
--- a/Documentation/media/uapi/dvb/audio-get-pts.rst
+++ /dev/null
@@ -1,65 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _AUDIO_GET_PTS:
-
-=============
-AUDIO_GET_PTS
-=============
-
-Name
-----
-
-AUDIO_GET_PTS
-
-.. attention:: This ioctl is deprecated
-
-Synopsis
---------
-
-.. c:function:: int ioctl(int fd, AUDIO_GET_PTS, __u64 *pts)
-    :name: AUDIO_GET_PTS
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -
-
-       -  __u64 \*pts
-
-       -  Returns the 33-bit timestamp as defined in ITU T-REC-H.222.0 /
-	  ISO/IEC 13818-1.
-
-	  The PTS should belong to the currently played frame if possible,
-	  but may also be a value close to it like the PTS of the last
-	  decoded frame or the last PTS extracted by the PES parser.
-
-
-Description
------------
-
-This ioctl is obsolete. Do not use in new drivers. If you need this
-functionality, then please contact the linux-media mailing list
-(`https://linuxtv.org/lists.php <https://linuxtv.org/lists.php>`__).
-
-This ioctl call asks the Audio Device to return the current PTS
-timestamp.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
diff --git a/Documentation/media/uapi/dvb/audio-set-attributes.rst b/Documentation/media/uapi/dvb/audio-set-attributes.rst
deleted file mode 100644
index f0c6153ca80f..000000000000
--- a/Documentation/media/uapi/dvb/audio-set-attributes.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _AUDIO_SET_ATTRIBUTES:
-
-====================
-AUDIO_SET_ATTRIBUTES
-====================
-
-Name
-----
-
-AUDIO_SET_ATTRIBUTES
-
-.. attention:: This ioctl is deprecated
-
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, AUDIO_SET_ATTRIBUTES, struct audio_attributes *attr )
-    :name: AUDIO_SET_ATTRIBUTES
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -
-
-       -  audio_attributes_t attr
-
-       -  audio attributes according to section ??
-
-
-Description
------------
-
-This ioctl is intended for DVD playback and allows you to set certain
-information about the audio stream.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  attr is not a valid or supported attribute setting.
diff --git a/Documentation/media/uapi/dvb/audio-set-ext-id.rst b/Documentation/media/uapi/dvb/audio-set-ext-id.rst
deleted file mode 100644
index 8503c47f26bd..000000000000
--- a/Documentation/media/uapi/dvb/audio-set-ext-id.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _AUDIO_SET_EXT_ID:
-
-================
-AUDIO_SET_EXT_ID
-================
-
-Name
-----
-
-AUDIO_SET_EXT_ID
-
-.. attention:: This ioctl is deprecated
-
-Synopsis
---------
-
-.. c:function:: int  ioctl(fd, AUDIO_SET_EXT_ID, int id)
-    :name: AUDIO_SET_EXT_ID
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -
-
-       -  int id
-
-       -  audio sub_stream_id
-
-
-Description
------------
-
-This ioctl can be used to set the extension id for MPEG streams in DVD
-playback. Only the first 3 bits are recognized.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  id is not a valid id.
diff --git a/Documentation/media/uapi/dvb/audio-set-karaoke.rst b/Documentation/media/uapi/dvb/audio-set-karaoke.rst
deleted file mode 100644
index c759952d88aa..000000000000
--- a/Documentation/media/uapi/dvb/audio-set-karaoke.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _AUDIO_SET_KARAOKE:
-
-=================
-AUDIO_SET_KARAOKE
-=================
-
-Name
-----
-
-AUDIO_SET_KARAOKE
-
-.. attention:: This ioctl is deprecated
-
-Synopsis
---------
-
-.. c:function:: int ioctl(fd, AUDIO_SET_KARAOKE, struct audio_karaoke *karaoke)
-    :name: AUDIO_SET_KARAOKE
-
-
-Arguments
----------
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -
-
-       -  int fd
-
-       -  File descriptor returned by a previous call to open().
-
-    -
-
-       -  audio_karaoke_t \*karaoke
-
-       -  karaoke settings according to section ??.
-
-
-Description
------------
-
-This ioctl allows one to set the mixer settings for a karaoke DVD.
-
-
-Return Value
-------------
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes <gen-errors>` chapter.
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-
-
-    -  .. row 1
-
-       -  ``EINVAL``
-
-       -  karaoke is not a valid or supported karaoke setting.
diff --git a/Documentation/media/uapi/dvb/audio_data_types.rst b/Documentation/media/uapi/dvb/audio_data_types.rst
index 6b93359d64f7..5bffa2c98a24 100644
--- a/Documentation/media/uapi/dvb/audio_data_types.rst
+++ b/Documentation/media/uapi/dvb/audio_data_types.rst
@@ -114,40 +114,3 @@ following bits set according to the hardwares capabilities.
      #define AUDIO_CAP_OGG   64
      #define AUDIO_CAP_SDDS 128
      #define AUDIO_CAP_AC3  256
-
-.. c:type:: audio_karaoke
-
-The ioctl AUDIO_SET_KARAOKE uses the following format:
-
-
-.. code-block:: c
-
-    typedef
-    struct audio_karaoke {
-	int vocal1;
-	int vocal2;
-	int melody;
-    } audio_karaoke_t;
-
-If Vocal1 or Vocal2 are non-zero, they get mixed into left and right t
-at 70% each. If both, Vocal1 and Vocal2 are non-zero, Vocal1 gets mixed
-into the left channel and Vocal2 into the right channel at 100% each. Ff
-Melody is non-zero, the melody channel gets mixed into left and right.
-
-
-.. c:type:: audio_attributes
-
-The following attributes can be set by a call to AUDIO_SET_ATTRIBUTES:
-
-
-.. code-block:: c
-
-     typedef uint16_t audio_attributes_t;
-     /*   bits: descr. */
-     /*   15-13 audio coding mode (0=ac3, 2=mpeg1, 3=mpeg2ext, 4=LPCM, 6=DTS, */
-     /*   12    multichannel extension */
-     /*   11-10 audio type (0=not spec, 1=language included) */
-     /*    9- 8 audio application mode (0=not spec, 1=karaoke, 2=surround) */
-     /*    7- 6 Quantization / DRC (mpeg audio: 1=DRC exists)(lpcm: 0=16bit,  */
-     /*    5- 4 Sample frequency fs (0=48kHz, 1=96kHz) */
-     /*    2- 0 number of audio channels (n+1 channels) */
diff --git a/Documentation/media/uapi/dvb/audio_function_calls.rst b/Documentation/media/uapi/dvb/audio_function_calls.rst
index 0bb56f0cfed4..7dba16285dab 100644
--- a/Documentation/media/uapi/dvb/audio_function_calls.rst
+++ b/Documentation/media/uapi/dvb/audio_function_calls.rst
@@ -22,13 +22,9 @@ Audio Function Calls
     audio-set-bypass-mode
     audio-channel-select
     audio-bilingual-channel-select
-    audio-get-pts
     audio-get-status
     audio-get-capabilities
     audio-clear-buffer
     audio-set-id
     audio-set-mixer
     audio-set-streamtype
-    audio-set-ext-id
-    audio-set-attributes
-    audio-set-karaoke
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 59216b172003..a9b00942e87d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1178,9 +1178,6 @@ COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
 COMPATIBLE_IOCTL(AUDIO_SET_ID)
 COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
 COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID)
-COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES)
-COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE)
 COMPATIBLE_IOCTL(DMX_START)
 COMPATIBLE_IOCTL(DMX_STOP)
 COMPATIBLE_IOCTL(DMX_SET_FILTER)
diff --git a/include/uapi/linux/dvb/audio.h b/include/uapi/linux/dvb/audio.h
index 69f7a85d81b1..afeae063e640 100644
--- a/include/uapi/linux/dvb/audio.h
+++ b/include/uapi/linux/dvb/audio.h
@@ -67,27 +67,6 @@ typedef struct audio_status {
 } audio_status_t;                              /* separate decoder hardware */
 
 
-typedef
-struct audio_karaoke {  /* if Vocal1 or Vocal2 are non-zero, they get mixed  */
-	int vocal1;    /* into left and right t at 70% each */
-	int vocal2;    /* if both, Vocal1 and Vocal2 are non-zero, Vocal1 gets*/
-	int melody;    /* mixed into the left channel and */
-		       /* Vocal2 into the right channel at 100% each. */
-		       /* if Melody is non-zero, the melody channel gets mixed*/
-} audio_karaoke_t;     /* into left and right  */
-
-
-typedef __u16 audio_attributes_t;
-/*   bits: descr. */
-/*   15-13 audio coding mode (0=ac3, 2=mpeg1, 3=mpeg2ext, 4=LPCM, 6=DTS, */
-/*   12    multichannel extension */
-/*   11-10 audio type (0=not spec, 1=language included) */
-/*    9- 8 audio application mode (0=not spec, 1=karaoke, 2=surround) */
-/*    7- 6 Quantization / DRC (mpeg audio: 1=DRC exists)(lpcm: 0=16bit,  */
-/*    5- 4 Sample frequency fs (0=48kHz, 1=96kHz) */
-/*    2- 0 number of audio channels (n+1 channels) */
-
-
 /* for GET_CAPABILITIES and SET_FORMAT, the latter should only set one bit */
 #define AUDIO_CAP_DTS    1
 #define AUDIO_CAP_LPCM   2
@@ -115,22 +94,6 @@ typedef __u16 audio_attributes_t;
 #define AUDIO_SET_ID               _IO('o', 13)
 #define AUDIO_SET_MIXER            _IOW('o', 14, audio_mixer_t)
 #define AUDIO_SET_STREAMTYPE       _IO('o', 15)
-#define AUDIO_SET_EXT_ID           _IO('o', 16)
-#define AUDIO_SET_ATTRIBUTES       _IOW('o', 17, audio_attributes_t)
-#define AUDIO_SET_KARAOKE          _IOW('o', 18, audio_karaoke_t)
-
-/**
- * AUDIO_GET_PTS
- *
- * Read the 33 bit presentation time stamp as defined
- * in ITU T-REC-H.222.0 / ISO/IEC 13818-1.
- *
- * The PTS should belong to the currently played
- * frame if possible, but may also be a value close to it
- * like the PTS of the last decoded frame or the last PTS
- * extracted by the PES parser.
- */
-#define AUDIO_GET_PTS              _IOR('o', 19, __u64)
 #define AUDIO_BILINGUAL_CHANNEL_SELECT _IO('o', 20)
 
 #endif /* _DVBAUDIO_H_ */
-- 
cgit v1.2.3


From a449938297e55e7e8958f8b48583f7d342da1930 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Fri, 13 Jul 2018 11:28:31 +0100
Subject: KVM: s390: Add huge page enablement control

General KVM huge page support on s390 has to be enabled via the
kvm.hpage module parameter. Either nested or hpage can be enabled, as
we currently do not support vSIE for huge backed guests. Once the vSIE
support is added we will either drop the parameter or enable it as
default.

For a guest the feature has to be enabled through the new
KVM_CAP_S390_HPAGE_1M capability and the hpage module
parameter. Enabling it means that cmm can't be enabled for the vm and
disables pfmf and storage key interpretation.

This is due to the fact that in some cases, in upcoming patches, we
have to split huge pages in the guest mapping to be able to set more
granular memory protection on 4k pages. These split pages have fake
page tables that are not visible to the Linux memory management which
subsequently will not manage its PGSTEs, while the SIE will. Disabling
these features lets us manage PGSTE data in a consistent manner and
solve that problem.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 16 +++++++++++++++
 arch/s390/kvm/kvm-s390.c          | 42 +++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/kvm.h          |  1 +
 3 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d10944e619d3..cb8db4f9d097 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4391,6 +4391,22 @@ all such vmexits.
 
 Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
 
+7.14 KVM_CAP_S390_HPAGE_1M
+
+Architectures: s390
+Parameters: none
+Returns: 0 on success, -EINVAL if hpage module parameter was not set
+	 or cmma is enabled, -EBUSY if VCPUs have already been created
+
+With this capability the KVM support for memory backing with 1m pages
+through hugetlbfs can be enabled for a VM. After the capability is
+enabled, cmma can't be enabled anymore and pfmfi and the storage key
+interpretation are disabled. If cmma has already been enabled or the
+hpage module parameter is not set to 1, -EINVAL is returned.
+
+While it is generally possible to create a huge page backed VM without
+this capability, the VM will not be able to run.
+
 8. Other capabilities.
 ----------------------
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 662f4d8046db..f9d90337e64a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -172,6 +172,10 @@ static int nested;
 module_param(nested, int, S_IRUGO);
 MODULE_PARM_DESC(nested, "Nested virtualization support");
 
+/* allow 1m huge page guest backing, if !nested */
+static int hpage;
+module_param(hpage, int, 0444);
+MODULE_PARM_DESC(hpage, "1m huge page backing support");
 
 /*
  * For now we handle at most 16 double words as this is what the s390 base
@@ -475,6 +479,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_AIS_MIGRATION:
 		r = 1;
 		break;
+	case KVM_CAP_S390_HPAGE_1M:
+		r = 0;
+		if (hpage)
+			r = 1;
+		break;
 	case KVM_CAP_S390_MEM_OP:
 		r = MEM_OP_MAX_SIZE;
 		break;
@@ -678,6 +687,27 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s",
 			 r ? "(not available)" : "(success)");
 		break;
+	case KVM_CAP_S390_HPAGE_1M:
+		mutex_lock(&kvm->lock);
+		if (kvm->created_vcpus)
+			r = -EBUSY;
+		else if (!hpage || kvm->arch.use_cmma)
+			r = -EINVAL;
+		else {
+			r = 0;
+			kvm->mm->context.allow_gmap_hpage_1m = 1;
+			/*
+			 * We might have to create fake 4k page
+			 * tables. To avoid that the hardware works on
+			 * stale PGSTEs, we emulate these instructions.
+			 */
+			kvm->arch.use_skf = 0;
+			kvm->arch.use_pfmfi = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s",
+			 r ? "(not available)" : "(success)");
+		break;
 	case KVM_CAP_S390_USER_STSI:
 		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI");
 		kvm->arch.user_stsi = 1;
@@ -725,10 +755,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		if (!sclp.has_cmma)
 			break;
 
-		ret = -EBUSY;
 		VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
 		mutex_lock(&kvm->lock);
-		if (!kvm->created_vcpus) {
+		if (kvm->created_vcpus)
+			ret = -EBUSY;
+		else if (kvm->mm->context.allow_gmap_hpage_1m)
+			ret = -EINVAL;
+		else {
 			kvm->arch.use_cmma = 1;
 			/* Not compatible with cmma. */
 			kvm->arch.use_pfmfi = 0;
@@ -4102,6 +4135,11 @@ static int __init kvm_s390_init(void)
 		return -ENODEV;
 	}
 
+	if (nested && hpage) {
+		pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently");
+		return -EINVAL;
+	}
+
 	for (i = 0; i < 16; i++)
 		kvm_s390_fac_base[i] |=
 			S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b6270a3b38e9..b955b986b341 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_GET_MSR_FEATURES 153
 #define KVM_CAP_HYPERV_EVENTFD 154
 #define KVM_CAP_HYPERV_TLBFLUSH 155
+#define KVM_CAP_S390_HPAGE_1M 156
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From d692f1138a4bac2efd2c8656ca15556b63479e82 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Mon, 30 Jul 2018 17:42:28 -0700
Subject: bpf: Support bpf_get_socket_cookie in more prog types

bpf_get_socket_cookie() helper can be used to identify skb that
correspond to the same socket.

Though socket cookie can be useful in many other use-cases where socket is
available in program context. Specifically BPF_PROG_TYPE_CGROUP_SOCK_ADDR
and BPF_PROG_TYPE_SOCK_OPS programs can benefit from it so that one of
them can augment a value in a map prepared earlier by other program for
the same socket.

The patch adds support to call bpf_get_socket_cookie() from
BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS.

It doesn't introduce new helpers. Instead it reuses same helper name
bpf_get_socket_cookie() but adds support to this helper to accept
`struct bpf_sock_addr` and `struct bpf_sock_ops`.

Documentation in bpf.h is changed in a way that should not break
automatic generation of markdown.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 14 ++++++++++++++
 net/core/filter.c        | 28 ++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 870113916cac..0ebaaf7f3568 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1371,6 +1371,20 @@ union bpf_attr {
  * 		A 8-byte long non-decreasing number on success, or 0 if the
  * 		socket field is missing inside *skb*.
  *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
+ * 	Description
+ * 		Equivalent to bpf_get_socket_cookie() helper that accepts
+ * 		*skb*, but gets socket from **struct bpf_sock_addr** context.
+ * 	Return
+ * 		A 8-byte long non-decreasing number.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
+ * 	Description
+ * 		Equivalent to bpf_get_socket_cookie() helper that accepts
+ * 		*skb*, but gets socket from **struct bpf_sock_ops** context.
+ * 	Return
+ * 		A 8-byte long non-decreasing number.
+ *
  * u32 bpf_get_socket_uid(struct sk_buff *skb)
  * 	Return
  * 		The owner UID of the socket associated to *skb*. If the socket
diff --git a/net/core/filter.c b/net/core/filter.c
index 7df1a0f1d1e1..9bb9a4488e25 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3812,6 +3812,30 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+	return sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
+	.func		= bpf_get_socket_cookie_sock_addr,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+	return sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
+	.func		= bpf_get_socket_cookie_sock_ops,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
 {
 	struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4818,6 +4842,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		default:
 			return NULL;
 		}
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_cookie_sock_addr_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4960,6 +4986,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sock_map_update_proto;
 	case BPF_FUNC_sock_hash_update:
 		return &bpf_sock_hash_update_proto;
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_cookie_sock_ops_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 54c990775f78113a708f24e15877f6b7bd9a1277 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Mon, 2 Jul 2018 04:21:19 -0400
Subject: macintosh/via-pmu68k: Don't load driver on unsupported hardware

Don't load the via-pmu68k driver on early PowerBooks. The M50753 PMU
device found in those models was never supported by this driver.
Attempting to load the driver usually causes a boot hang.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Reviewed-by: Michael Schmitz <schmitzmic@gmail.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/m68k/mac/misc.c           | 6 ++----
 drivers/macintosh/via-pmu68k.c | 4 ----
 include/uapi/linux/pmu.h       | 2 +-
 3 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index c68054361615..7ccb799eeb57 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -478,8 +478,7 @@ void mac_poweroff(void)
 		cuda_shutdown();
 #endif
 #ifdef CONFIG_ADB_PMU68K
-	} else if (macintosh_config->adb_type == MAC_ADB_PB1
-		|| macintosh_config->adb_type == MAC_ADB_PB2) {
+	} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
 		pmu_shutdown();
 #endif
 	}
@@ -520,8 +519,7 @@ void mac_reset(void)
 		cuda_restart();
 #endif
 #ifdef CONFIG_ADB_PMU68K
-	} else if (macintosh_config->adb_type == MAC_ADB_PB1
-		|| macintosh_config->adb_type == MAC_ADB_PB2) {
+	} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
 		pmu_restart();
 #endif
 	} else if (CPU_IS_030) {
diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c
index d545ed45e482..bec8e1837d7d 100644
--- a/drivers/macintosh/via-pmu68k.c
+++ b/drivers/macintosh/via-pmu68k.c
@@ -175,9 +175,6 @@ static s8 pmu_data_len[256][2] = {
 int __init find_via_pmu(void)
 {
 	switch (macintosh_config->adb_type) {
-	case MAC_ADB_PB1:
-		pmu_kind = PMU_68K_V1;
-		break;
 	case MAC_ADB_PB2:
 		pmu_kind = PMU_68K_V2;
 		break;
@@ -785,7 +782,6 @@ pmu_enable_backlight(int on)
 	    /* first call: get current backlight value */
 	    if (backlight_level < 0) {
 		switch(pmu_kind) {
-		    case PMU_68K_V1:
 		    case PMU_68K_V2:
 			pmu_request(&req, NULL, 3, PMU_READ_NVRAM, 0x14, 0xe);
 			while (!req.complete)
diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h
index 89cb1acea93a..e128f609281a 100644
--- a/include/uapi/linux/pmu.h
+++ b/include/uapi/linux/pmu.h
@@ -93,7 +93,7 @@ enum {
 	PMU_HEATHROW_BASED,	/* PowerBook G3 series */
 	PMU_PADDINGTON_BASED,	/* 1999 PowerBook G3 */
 	PMU_KEYLARGO_BASED,	/* Core99 motherboard (PMU99) */
-	PMU_68K_V1,		/* 68K PMU, version 1 */
+	PMU_68K_V1,		/* Unused/deprecated */
 	PMU_68K_V2, 		/* 68K PMU, version 2 */
 };
 
-- 
cgit v1.2.3


From ebd722275f9cfc6752e29d2412fa3816ca05764b Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Mon, 2 Jul 2018 04:21:19 -0400
Subject: macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver

Now that the PowerMac via-pmu driver supports m68k PowerBooks,
switch over to that driver and remove the via-pmu68k driver.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/m68k/configs/mac_defconfig   |   2 +-
 arch/m68k/configs/multi_defconfig |   2 +-
 arch/m68k/mac/config.c            |   2 +-
 arch/m68k/mac/misc.c              |  48 +--
 drivers/macintosh/Kconfig         |  13 +-
 drivers/macintosh/Makefile        |   1 -
 drivers/macintosh/adb.c           |   2 +-
 drivers/macintosh/via-pmu68k.c    | 846 --------------------------------------
 include/uapi/linux/pmu.h          |   2 +-
 9 files changed, 14 insertions(+), 904 deletions(-)
 delete mode 100644 drivers/macintosh/via-pmu68k.c

(limited to 'include/uapi/linux')

diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index b52e597899eb..087ca15e32f1 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -369,7 +369,7 @@ CONFIG_TCM_PSCSI=m
 CONFIG_ADB=y
 CONFIG_ADB_MACII=y
 CONFIG_ADB_IOP=y
-CONFIG_ADB_PMU68K=y
+CONFIG_ADB_PMU=y
 CONFIG_ADB_CUDA=y
 CONFIG_INPUT_ADBHID=y
 CONFIG_MAC_EMUMOUSEBTN=y
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 2a84eeec5b02..3f9334084d55 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -402,7 +402,7 @@ CONFIG_TCM_PSCSI=m
 CONFIG_ADB=y
 CONFIG_ADB_MACII=y
 CONFIG_ADB_IOP=y
-CONFIG_ADB_PMU68K=y
+CONFIG_ADB_PMU=y
 CONFIG_ADB_CUDA=y
 CONFIG_INPUT_ADBHID=y
 CONFIG_MAC_EMUMOUSEBTN=y
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index e522307db47c..92e80cf0d8aa 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -891,7 +891,7 @@ static void __init mac_identify(void)
 #ifdef CONFIG_ADB_CUDA
 	find_via_cuda();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 	find_via_pmu();
 #endif
 }
diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index 7ccb799eeb57..28090a44fa09 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -85,7 +85,7 @@ static void cuda_write_pram(int offset, __u8 data)
 }
 #endif /* CONFIG_ADB_CUDA */
 
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 static long pmu_read_time(void)
 {
 	struct adb_request req;
@@ -136,7 +136,7 @@ static void pmu_write_pram(int offset, __u8 data)
 	while (!req.complete)
 		pmu_poll();
 }
-#endif /* CONFIG_ADB_PMU68K */
+#endif /* CONFIG_ADB_PMU */
 
 /*
  * VIA PRAM/RTC access routines
@@ -367,38 +367,6 @@ static void cuda_shutdown(void)
 }
 #endif /* CONFIG_ADB_CUDA */
 
-#ifdef CONFIG_ADB_PMU68K
-
-void pmu_restart(void)
-{
-	struct adb_request req;
-	if (pmu_request(&req, NULL,
-			2, PMU_SET_INTR_MASK, PMU_INT_ADB|PMU_INT_TICK) < 0)
-		return;
-	while (!req.complete)
-		pmu_poll();
-	if (pmu_request(&req, NULL, 1, PMU_RESET) < 0)
-		return;
-	while (!req.complete)
-		pmu_poll();
-}
-
-void pmu_shutdown(void)
-{
-	struct adb_request req;
-	if (pmu_request(&req, NULL,
-			2, PMU_SET_INTR_MASK, PMU_INT_ADB|PMU_INT_TICK) < 0)
-		return;
-	while (!req.complete)
-		pmu_poll();
-	if (pmu_request(&req, NULL, 5, PMU_SHUTDOWN, 'M', 'A', 'T', 'T') < 0)
-		return;
-	while (!req.complete)
-		pmu_poll();
-}
-
-#endif
-
 /*
  *-------------------------------------------------------------------
  * Below this point are the generic routines; they'll dispatch to the
@@ -423,7 +391,7 @@ void mac_pram_read(int offset, __u8 *buffer, int len)
 		func = cuda_read_pram;
 		break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 	case MAC_ADB_PB2:
 		func = pmu_read_pram;
 		break;
@@ -453,7 +421,7 @@ void mac_pram_write(int offset, __u8 *buffer, int len)
 		func = cuda_write_pram;
 		break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 	case MAC_ADB_PB2:
 		func = pmu_write_pram;
 		break;
@@ -477,7 +445,7 @@ void mac_poweroff(void)
 	           macintosh_config->adb_type == MAC_ADB_CUDA) {
 		cuda_shutdown();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 	} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
 		pmu_shutdown();
 #endif
@@ -518,7 +486,7 @@ void mac_reset(void)
 	           macintosh_config->adb_type == MAC_ADB_CUDA) {
 		cuda_restart();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 	} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
 		pmu_restart();
 #endif
@@ -670,7 +638,7 @@ int mac_hwclk(int op, struct rtc_time *t)
 			now = cuda_read_time();
 			break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 		case MAC_ADB_PB2:
 			now = pmu_read_time();
 			break;
@@ -706,7 +674,7 @@ int mac_hwclk(int op, struct rtc_time *t)
 			cuda_write_time(now);
 			break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 		case MAC_ADB_PB2:
 			pmu_write_time(now);
 			break;
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 26abae4c899d..47c350cdfb12 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -39,17 +39,6 @@ config ADB_IOP
 	  <http://www.angelfire.com/ca2/dev68k/iopdesc.html> to enable direct
 	  support for it, say 'Y' here.
 
-config ADB_PMU68K
-	bool "Include PMU (Powerbook) ADB driver"
-	depends on ADB && MAC
-	help
-	  Say Y here if want your kernel to support the m68k based Powerbooks.
-	  This includes the PowerBook 140, PowerBook 145, PowerBook 150,
-	  PowerBook 160, PowerBook 165, PowerBook 165c, PowerBook 170,
-	  PowerBook 180, PowerBook, 180c, PowerBook 190cs, PowerBook 520,
-	  PowerBook Duo 210, PowerBook Duo 230, PowerBook Duo 250,
-	  PowerBook Duo 270c, PowerBook Duo 280 and PowerBook Duo 280c.
-
 # we want to change this to something like CONFIG_SYSCTRL_CUDA/PMU
 config ADB_CUDA
 	bool "Support for Cuda/Egret based Macs and PowerMacs"
@@ -66,7 +55,7 @@ config ADB_CUDA
 
 config ADB_PMU
 	bool "Support for PMU based PowerMacs and PowerBooks"
-	depends on PPC_PMAC
+	depends on PPC_PMAC || MAC
 	help
 	  On PowerBooks, iBooks, and recent iMacs and Power Macintoshes, the
 	  PMU is an embedded microprocessor whose primary function is to
diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile
index ee803638e595..49819b1b6f20 100644
--- a/drivers/macintosh/Makefile
+++ b/drivers/macintosh/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_PMAC_SMU)		+= smu.o
 obj-$(CONFIG_ADB)		+= adb.o
 obj-$(CONFIG_ADB_MACII)		+= via-macii.o
 obj-$(CONFIG_ADB_IOP)		+= adb-iop.o
-obj-$(CONFIG_ADB_PMU68K)	+= via-pmu68k.o
 obj-$(CONFIG_ADB_MACIO)		+= macio-adb.o
 
 obj-$(CONFIG_THERM_WINDTUNNEL)	+= therm_windtunnel.o
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 4c8097e0e6fe..76e98f0f7a3e 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -65,7 +65,7 @@ static struct adb_driver *adb_driver_list[] = {
 #ifdef CONFIG_ADB_IOP
 	&adb_iop_driver,
 #endif
-#if defined(CONFIG_ADB_PMU) || defined(CONFIG_ADB_PMU68K)
+#ifdef CONFIG_ADB_PMU
 	&via_pmu_driver,
 #endif
 #ifdef CONFIG_ADB_MACIO
diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c
deleted file mode 100644
index bec8e1837d7d..000000000000
--- a/drivers/macintosh/via-pmu68k.c
+++ /dev/null
@@ -1,846 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Device driver for the PMU on 68K-based Apple PowerBooks
- *
- * The VIA (versatile interface adapter) interfaces to the PMU,
- * a 6805 microprocessor core whose primary function is to control
- * battery charging and system power on the PowerBooks.
- * The PMU also controls the ADB (Apple Desktop Bus) which connects
- * to the keyboard and mouse, as well as the non-volatile RAM
- * and the RTC (real time clock) chip.
- *
- * Adapted for 68K PMU by Joshua M. Thompson
- *
- * Based largely on the PowerMac PMU code by Paul Mackerras and
- * Fabio Riccardi.
- *
- * Also based on the PMU driver from MkLinux by Apple Computer, Inc.
- * and the Open Software Foundation, Inc.
- */
-
-#include <stdarg.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/miscdevice.h>
-#include <linux/blkdev.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-
-#include <linux/adb.h>
-#include <linux/pmu.h>
-#include <linux/cuda.h>
-
-#include <asm/macintosh.h>
-#include <asm/macints.h>
-#include <asm/mac_via.h>
-
-#include <asm/pgtable.h>
-#include <asm/irq.h>
-#include <linux/uaccess.h>
-
-/* Misc minor number allocated for /dev/pmu */
-#define PMU_MINOR	154
-
-/* VIA registers - spaced 0x200 bytes apart */
-#define RS		0x200		/* skip between registers */
-#define B		0		/* B-side data */
-#define A		RS		/* A-side data */
-#define DIRB		(2*RS)		/* B-side direction (1=output) */
-#define DIRA		(3*RS)		/* A-side direction (1=output) */
-#define T1CL		(4*RS)		/* Timer 1 ctr/latch (low 8 bits) */
-#define T1CH		(5*RS)		/* Timer 1 counter (high 8 bits) */
-#define T1LL		(6*RS)		/* Timer 1 latch (low 8 bits) */
-#define T1LH		(7*RS)		/* Timer 1 latch (high 8 bits) */
-#define T2CL		(8*RS)		/* Timer 2 ctr/latch (low 8 bits) */
-#define T2CH		(9*RS)		/* Timer 2 counter (high 8 bits) */
-#define SR		(10*RS)		/* Shift register */
-#define ACR		(11*RS)		/* Auxiliary control register */
-#define PCR		(12*RS)		/* Peripheral control register */
-#define IFR		(13*RS)		/* Interrupt flag register */
-#define IER		(14*RS)		/* Interrupt enable register */
-#define ANH		(15*RS)		/* A-side data, no handshake */
-
-/* Bits in B data register: both active low */
-#define TACK		0x02		/* Transfer acknowledge (input) */
-#define TREQ		0x04		/* Transfer request (output) */
-
-/* Bits in ACR */
-#define SR_CTRL		0x1c		/* Shift register control bits */
-#define SR_EXT		0x0c		/* Shift on external clock */
-#define SR_OUT		0x10		/* Shift out if 1 */
-
-/* Bits in IFR and IER */
-#define SR_INT		0x04		/* Shift register full/empty */
-#define CB1_INT		0x10		/* transition on CB1 input */
-
-static enum pmu_state {
-	idle,
-	sending,
-	intack,
-	reading,
-	reading_intr,
-} pmu_state;
-
-static struct adb_request *current_req;
-static struct adb_request *last_req;
-static struct adb_request *req_awaiting_reply;
-static unsigned char interrupt_data[32];
-static unsigned char *reply_ptr;
-static int data_index;
-static int data_len;
-static int adb_int_pending;
-static int pmu_adb_flags;
-static int adb_dev_map;
-static struct adb_request bright_req_1, bright_req_2, bright_req_3;
-static int pmu_kind = PMU_UNKNOWN;
-static int pmu_fully_inited;
-
-int asleep;
-
-static int pmu_probe(void);
-static int pmu_init(void);
-static void pmu_start(void);
-static irqreturn_t pmu_interrupt(int irq, void *arg);
-static int pmu_send_request(struct adb_request *req, int sync);
-static int pmu_autopoll(int devs);
-void pmu_poll(void);
-static int pmu_reset_bus(void);
-
-static int init_pmu(void);
-static void pmu_start(void);
-static void send_byte(int x);
-static void recv_byte(void);
-static void pmu_done(struct adb_request *req);
-static void pmu_handle_data(unsigned char *data, int len);
-static void set_volume(int level);
-static void pmu_enable_backlight(int on);
-static void pmu_set_brightness(int level);
-
-struct adb_driver via_pmu_driver = {
-	.name         = "68K PMU",
-	.probe        = pmu_probe,
-	.init         = pmu_init,
-	.send_request = pmu_send_request,
-	.autopoll     = pmu_autopoll,
-	.poll         = pmu_poll,
-	.reset_bus    = pmu_reset_bus,
-};
-
-/*
- * This table indicates for each PMU opcode:
- * - the number of data bytes to be sent with the command, or -1
- *   if a length byte should be sent,
- * - the number of response bytes which the PMU will return, or
- *   -1 if it will send a length byte.
- */
-static s8 pmu_data_len[256][2] = {
-/*	   0	   1	   2	   3	   4	   5	   6	   7  */
-/*00*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*08*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-/*10*/	{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*18*/	{ 0, 1},{ 0, 1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{ 0, 0},
-/*20*/	{-1, 0},{ 0, 0},{ 2, 0},{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*28*/	{ 0,-1},{ 0,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{ 0,-1},
-/*30*/	{ 4, 0},{20, 0},{-1, 0},{ 3, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*38*/	{ 0, 4},{ 0,20},{ 2,-1},{ 2, 1},{ 3,-1},{-1,-1},{-1,-1},{ 4, 0},
-/*40*/	{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*48*/	{ 0, 1},{ 0, 1},{-1,-1},{ 1, 0},{ 1, 0},{-1,-1},{-1,-1},{-1,-1},
-/*50*/	{ 1, 0},{ 0, 0},{ 2, 0},{ 2, 0},{-1, 0},{ 1, 0},{ 3, 0},{ 1, 0},
-/*58*/	{ 0, 1},{ 1, 0},{ 0, 2},{ 0, 2},{ 0,-1},{-1,-1},{-1,-1},{-1,-1},
-/*60*/	{ 2, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*68*/	{ 0, 3},{ 0, 3},{ 0, 2},{ 0, 8},{ 0,-1},{ 0,-1},{-1,-1},{-1,-1},
-/*70*/	{ 1, 0},{ 1, 0},{ 1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*78*/	{ 0,-1},{ 0,-1},{-1,-1},{-1,-1},{-1,-1},{ 5, 1},{ 4, 1},{ 4, 1},
-/*80*/	{ 4, 0},{-1, 0},{ 0, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*88*/	{ 0, 5},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-/*90*/	{ 1, 0},{ 2, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*98*/	{ 0, 1},{ 0, 1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-/*a0*/	{ 2, 0},{ 2, 0},{ 2, 0},{ 4, 0},{-1, 0},{ 0, 0},{-1, 0},{-1, 0},
-/*a8*/	{ 1, 1},{ 1, 0},{ 3, 0},{ 2, 0},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-/*b0*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*b8*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-/*c0*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*c8*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-/*d0*/	{ 0, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*d8*/	{ 1, 1},{ 1, 1},{-1,-1},{-1,-1},{ 0, 1},{ 0,-1},{-1,-1},{-1,-1},
-/*e0*/	{-1, 0},{ 4, 0},{ 0, 1},{-1, 0},{-1, 0},{ 4, 0},{-1, 0},{-1, 0},
-/*e8*/	{ 3,-1},{-1,-1},{ 0, 1},{-1,-1},{ 0,-1},{-1,-1},{-1,-1},{ 0, 0},
-/*f0*/	{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},{-1, 0},
-/*f8*/	{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},{-1,-1},
-};
-
-int __init find_via_pmu(void)
-{
-	switch (macintosh_config->adb_type) {
-	case MAC_ADB_PB2:
-		pmu_kind = PMU_68K_V2;
-		break;
-	default:
-		pmu_kind = PMU_UNKNOWN;
-		return -ENODEV;
-	}
-
-	pmu_state = idle;
-
-	if (!init_pmu())
-		goto fail_init;
-
-	pr_info("adb: PMU 68K driver v0.5 for Unified ADB\n");
-
-	return 1;
-
-fail_init:
-	pmu_kind = PMU_UNKNOWN;
-	return 0;
-}
-
-static int pmu_probe(void)
-{
-	if (pmu_kind == PMU_UNKNOWN)
-		return -ENODEV;
-	return 0;
-}
-
-static int pmu_init(void)
-{
-	if (pmu_kind == PMU_UNKNOWN)
-		return -ENODEV;
-	return 0;
-}
-
-static int __init via_pmu_start(void)
-{
-	if (pmu_kind == PMU_UNKNOWN)
-		return -ENODEV;
-
-	if (request_irq(IRQ_MAC_ADB_SR, pmu_interrupt, 0, "PMU_SR",
-			pmu_interrupt)) {
-		pr_err("%s: can't get SR irq\n", __func__);
-		return -ENODEV;
-	}
-	if (request_irq(IRQ_MAC_ADB_CL, pmu_interrupt, 0, "PMU_CL",
-			pmu_interrupt)) {
-		pr_err("%s: can't get CL irq\n", __func__);
-		free_irq(IRQ_MAC_ADB_SR, pmu_interrupt);
-		return -ENODEV;
-	}
-
-	pmu_fully_inited = 1;
-
-	/* Enable backlight */
-	pmu_enable_backlight(1);
-
-	return 0;
-}
-
-arch_initcall(via_pmu_start);
-
-static int __init init_pmu(void)
-{
-	int timeout;
-	volatile struct adb_request req;
-
-	via2[B] |= TREQ;				/* negate TREQ */
-	via2[DIRB] = (via2[DIRB] | TREQ) & ~TACK;	/* TACK in, TREQ out */
-
-	pmu_request((struct adb_request *) &req, NULL, 2, PMU_SET_INTR_MASK, PMU_INT_ADB);
-	timeout =  100000;
-	while (!req.complete) {
-		if (--timeout < 0) {
-			printk(KERN_ERR "pmu_init: no response from PMU\n");
-			return -EAGAIN;
-		}
-		udelay(10);
-		pmu_poll();
-	}
-
-	/* ack all pending interrupts */
-	timeout = 100000;
-	interrupt_data[0] = 1;
-	while (interrupt_data[0] || pmu_state != idle) {
-		if (--timeout < 0) {
-			printk(KERN_ERR "pmu_init: timed out acking intrs\n");
-			return -EAGAIN;
-		}
-		if (pmu_state == idle) {
-			adb_int_pending = 1;
-			pmu_interrupt(0, NULL);
-		}
-		pmu_poll();
-		udelay(10);
-	}
-
-	pmu_request((struct adb_request *) &req, NULL, 2, PMU_SET_INTR_MASK,
-			PMU_INT_ADB_AUTO|PMU_INT_SNDBRT|PMU_INT_ADB);
-	timeout =  100000;
-	while (!req.complete) {
-		if (--timeout < 0) {
-			printk(KERN_ERR "pmu_init: no response from PMU\n");
-			return -EAGAIN;
-		}
-		udelay(10);
-		pmu_poll();
-	}
-
-	bright_req_1.complete = 1;
-	bright_req_2.complete = 1;
-	bright_req_3.complete = 1;
-
-	return 1;
-}
-
-int
-pmu_get_model(void)
-{
-	return pmu_kind;
-}
-
-/* Send an ADB command */
-static int 
-pmu_send_request(struct adb_request *req, int sync)
-{
-    int i, ret;
-
-    if (!pmu_fully_inited)
-    {
- 	req->complete = 1;
-   	return -ENXIO;
-   }
-
-    ret = -EINVAL;
-	
-    switch (req->data[0]) {
-    case PMU_PACKET:
-		for (i = 0; i < req->nbytes - 1; ++i)
-			req->data[i] = req->data[i+1];
-		--req->nbytes;
-		if (pmu_data_len[req->data[0]][1] != 0) {
-			req->reply[0] = ADB_RET_OK;
-			req->reply_len = 1;
-		} else
-			req->reply_len = 0;
-		ret = pmu_queue_request(req);
-		break;
-    case CUDA_PACKET:
-		switch (req->data[1]) {
-		case CUDA_GET_TIME:
-			if (req->nbytes != 2)
-				break;
-			req->data[0] = PMU_READ_RTC;
-			req->nbytes = 1;
-			req->reply_len = 3;
-			req->reply[0] = CUDA_PACKET;
-			req->reply[1] = 0;
-			req->reply[2] = CUDA_GET_TIME;
-			ret = pmu_queue_request(req);
-			break;
-		case CUDA_SET_TIME:
-			if (req->nbytes != 6)
-				break;
-			req->data[0] = PMU_SET_RTC;
-			req->nbytes = 5;
-			for (i = 1; i <= 4; ++i)
-				req->data[i] = req->data[i+1];
-			req->reply_len = 3;
-			req->reply[0] = CUDA_PACKET;
-			req->reply[1] = 0;
-			req->reply[2] = CUDA_SET_TIME;
-			ret = pmu_queue_request(req);
-			break;
-		case CUDA_GET_PRAM:
-			if (req->nbytes != 4)
-				break;
-			req->data[0] = PMU_READ_NVRAM;
-			req->data[1] = req->data[2];
-			req->data[2] = req->data[3];
-			req->nbytes = 3;
-			req->reply_len = 3;
-			req->reply[0] = CUDA_PACKET;
-			req->reply[1] = 0;
-			req->reply[2] = CUDA_GET_PRAM;
-			ret = pmu_queue_request(req);
-			break;
-		case CUDA_SET_PRAM:
-			if (req->nbytes != 5)
-				break;
-			req->data[0] = PMU_WRITE_NVRAM;
-			req->data[1] = req->data[2];
-			req->data[2] = req->data[3];
-			req->data[3] = req->data[4];
-			req->nbytes = 4;
-			req->reply_len = 3;
-			req->reply[0] = CUDA_PACKET;
-			req->reply[1] = 0;
-			req->reply[2] = CUDA_SET_PRAM;
-			ret = pmu_queue_request(req);
-			break;
-		}
-		break;
-    case ADB_PACKET:
-		for (i = req->nbytes - 1; i > 1; --i)
-			req->data[i+2] = req->data[i];
-		req->data[3] = req->nbytes - 2;
-		req->data[2] = pmu_adb_flags;
-		/*req->data[1] = req->data[1];*/
-		req->data[0] = PMU_ADB_CMD;
-		req->nbytes += 2;
-		req->reply_expected = 1;
-		req->reply_len = 0;
-		ret = pmu_queue_request(req);
-		break;
-    }
-    if (ret)
-    {
-    	req->complete = 1;
-    	return ret;
-    }
-    	
-    if (sync) {
-	while (!req->complete)
-		pmu_poll();
-    }
-
-    return 0;
-}
-
-/* Enable/disable autopolling */
-static int 
-pmu_autopoll(int devs)
-{
-	struct adb_request req;
-
-	if (!pmu_fully_inited) return -ENXIO;
-
-	if (devs) {
-		adb_dev_map = devs;
-		pmu_request(&req, NULL, 5, PMU_ADB_CMD, 0, 0x86,
-			    adb_dev_map >> 8, adb_dev_map);
-		pmu_adb_flags = 2;
-	} else {
-		pmu_request(&req, NULL, 1, PMU_ADB_POLL_OFF);
-		pmu_adb_flags = 0;
-	}
-	while (!req.complete)
-		pmu_poll();
-	return 0;
-}
-
-/* Reset the ADB bus */
-static int 
-pmu_reset_bus(void)
-{
-	struct adb_request req;
-	long timeout;
-	int save_autopoll = adb_dev_map;
-
-	if (!pmu_fully_inited) return -ENXIO;
-
-	/* anyone got a better idea?? */
-	pmu_autopoll(0);
-
-	req.nbytes = 5;
-	req.done = NULL;
-	req.data[0] = PMU_ADB_CMD;
-	req.data[1] = 0;
-	req.data[2] = 3; /* ADB_BUSRESET ??? */
-	req.data[3] = 0;
-	req.data[4] = 0;
-	req.reply_len = 0;
-	req.reply_expected = 1;
-	if (pmu_queue_request(&req) != 0)
-	{
-		printk(KERN_ERR "pmu_adb_reset_bus: pmu_queue_request failed\n");
-		return -EIO;
-	}
-	while (!req.complete)
-		pmu_poll();
-	timeout = 100000;
-	while (!req.complete) {
-		if (--timeout < 0) {
-			printk(KERN_ERR "pmu_adb_reset_bus (reset): no response from PMU\n");
-			return -EIO;
-		}
-		udelay(10);
-		pmu_poll();
-	}
-
-	if (save_autopoll != 0)
-		pmu_autopoll(save_autopoll);
-		
-	return 0;
-}
-
-/* Construct and send a pmu request */
-int 
-pmu_request(struct adb_request *req, void (*done)(struct adb_request *),
-	    int nbytes, ...)
-{
-	va_list list;
-	int i;
-
-	if (nbytes < 0 || nbytes > 32) {
-		printk(KERN_ERR "pmu_request: bad nbytes (%d)\n", nbytes);
-		req->complete = 1;
-		return -EINVAL;
-	}
-	req->nbytes = nbytes;
-	req->done = done;
-	va_start(list, nbytes);
-	for (i = 0; i < nbytes; ++i)
-		req->data[i] = va_arg(list, int);
-	va_end(list);
-	if (pmu_data_len[req->data[0]][1] != 0) {
-		req->reply[0] = ADB_RET_OK;
-		req->reply_len = 1;
-	} else
-		req->reply_len = 0;
-	req->reply_expected = 0;
-	return pmu_queue_request(req);
-}
-
-int
-pmu_queue_request(struct adb_request *req)
-{
-	unsigned long flags;
-	int nsend;
-
-	if (req->nbytes <= 0) {
-		req->complete = 1;
-		return 0;
-	}
-	nsend = pmu_data_len[req->data[0]][0];
-	if (nsend >= 0 && req->nbytes != nsend + 1) {
-		req->complete = 1;
-		return -EINVAL;
-	}
-
-	req->next = NULL;
-	req->sent = 0;
-	req->complete = 0;
-	local_irq_save(flags);
-
-	if (current_req != 0) {
-		last_req->next = req;
-		last_req = req;
-	} else {
-		current_req = req;
-		last_req = req;
-		if (pmu_state == idle)
-			pmu_start();
-	}
-
-	local_irq_restore(flags);
-	return 0;
-}
-
-static void 
-send_byte(int x)
-{
-	via1[ACR] |= SR_CTRL;
-	via1[SR] = x;
-	via2[B] &= ~TREQ;		/* assert TREQ */
-}
-
-static void 
-recv_byte(void)
-{
-	char c;
-
-	via1[ACR] = (via1[ACR] | SR_EXT) & ~SR_OUT;
-	c = via1[SR];		/* resets SR */
-	via2[B] &= ~TREQ;
-}
-
-static void 
-pmu_start(void)
-{
-	unsigned long flags;
-	struct adb_request *req;
-
-	/* assert pmu_state == idle */
-	/* get the packet to send */
-	local_irq_save(flags);
-	req = current_req;
-	if (req == 0 || pmu_state != idle
-	    || (req->reply_expected && req_awaiting_reply))
-		goto out;
-
-	pmu_state = sending;
-	data_index = 1;
-	data_len = pmu_data_len[req->data[0]][0];
-
-	/* set the shift register to shift out and send a byte */
-	send_byte(req->data[0]);
-
-out:
-	local_irq_restore(flags);
-}
-
-void 
-pmu_poll(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	if (via1[IFR] & SR_INT) {
-		via1[IFR] = SR_INT;
-		pmu_interrupt(IRQ_MAC_ADB_SR, NULL);
-	}
-	if (via1[IFR] & CB1_INT) {
-		via1[IFR] = CB1_INT;
-		pmu_interrupt(IRQ_MAC_ADB_CL, NULL);
-	}
-	local_irq_restore(flags);
-}
-
-static irqreturn_t
-pmu_interrupt(int irq, void *dev_id)
-{
-	struct adb_request *req;
-	int timeout, bite = 0;	/* to prevent compiler warning */
-
-#if 0
-	printk("pmu_interrupt: irq %d state %d acr %02X, b %02X data_index %d/%d adb_int_pending %d\n",
-		irq, pmu_state, (uint) via1[ACR], (uint) via2[B], data_index, data_len, adb_int_pending);
-#endif
-
-	if (irq == IRQ_MAC_ADB_CL) {		/* CB1 interrupt */
-		adb_int_pending = 1;
-	} else if (irq == IRQ_MAC_ADB_SR) {	/* SR interrupt  */
-		if (via2[B] & TACK) {
-			printk(KERN_DEBUG "PMU: SR_INT but ack still high! (%x)\n", via2[B]);
-		}
-
-		/* if reading grab the byte */
-		if ((via1[ACR] & SR_OUT) == 0) bite = via1[SR];
-
-		/* reset TREQ and wait for TACK to go high */
-		via2[B] |= TREQ;
-		timeout = 3200;
-		while (!(via2[B] & TACK)) {
-			if (--timeout < 0) {
-				printk(KERN_ERR "PMU not responding (!ack)\n");
-				goto finish;
-			}
-			udelay(10);
-		}
-
-		switch (pmu_state) {
-		case sending:
-			req = current_req;
-			if (data_len < 0) {
-				data_len = req->nbytes - 1;
-				send_byte(data_len);
-				break;
-			}
-			if (data_index <= data_len) {
-				send_byte(req->data[data_index++]);
-				break;
-			}
-			req->sent = 1;
-			data_len = pmu_data_len[req->data[0]][1];
-			if (data_len == 0) {
-				pmu_state = idle;
-				current_req = req->next;
-				if (req->reply_expected)
-					req_awaiting_reply = req;
-				else
-					pmu_done(req);
-			} else {
-				pmu_state = reading;
-				data_index = 0;
-				reply_ptr = req->reply + req->reply_len;
-				recv_byte();
-			}
-			break;
-
-		case intack:
-			data_index = 0;
-			data_len = -1;
-			pmu_state = reading_intr;
-			reply_ptr = interrupt_data;
-			recv_byte();
-			break;
-
-		case reading:
-		case reading_intr:
-			if (data_len == -1) {
-				data_len = bite;
-				if (bite > 32)
-					printk(KERN_ERR "PMU: bad reply len %d\n",
-					       bite);
-			} else {
-				reply_ptr[data_index++] = bite;
-			}
-			if (data_index < data_len) {
-				recv_byte();
-				break;
-			}
-
-			if (pmu_state == reading_intr) {
-				pmu_handle_data(interrupt_data, data_index);
-			} else {
-				req = current_req;
-				current_req = req->next;
-				req->reply_len += data_index;
-				pmu_done(req);
-			}
-			pmu_state = idle;
-
-			break;
-
-		default:
-			printk(KERN_ERR "pmu_interrupt: unknown state %d?\n",
-			       pmu_state);
-		}
-	}
-finish:
-	if (pmu_state == idle) {
-		if (adb_int_pending) {
-			pmu_state = intack;
-			send_byte(PMU_INT_ACK);
-			adb_int_pending = 0;
-		} else if (current_req) {
-			pmu_start();
-		}
-	}
-
-#if 0
-	printk("pmu_interrupt: exit state %d acr %02X, b %02X data_index %d/%d adb_int_pending %d\n",
-		pmu_state, (uint) via1[ACR], (uint) via2[B], data_index, data_len, adb_int_pending);
-#endif
-	return IRQ_HANDLED;
-}
-
-static void 
-pmu_done(struct adb_request *req)
-{
-	req->complete = 1;
-	if (req->done)
-		(*req->done)(req);
-}
-
-/* Interrupt data could be the result data from an ADB cmd */
-static void 
-pmu_handle_data(unsigned char *data, int len)
-{
-	static int show_pmu_ints = 1;
-
-	asleep = 0;
-	if (len < 1) {
-		adb_int_pending = 0;
-		return;
-	}
-	if (data[0] & PMU_INT_ADB) {
-		if ((data[0] & PMU_INT_ADB_AUTO) == 0) {
-			struct adb_request *req = req_awaiting_reply;
-			if (req == 0) {
-				printk(KERN_ERR "PMU: extra ADB reply\n");
-				return;
-			}
-			req_awaiting_reply = NULL;
-			if (len <= 2)
-				req->reply_len = 0;
-			else {
-				memcpy(req->reply, data + 1, len - 1);
-				req->reply_len = len - 1;
-			}
-			pmu_done(req);
-		} else {
-			adb_input(data+1, len-1, 1);
-		}
-	} else {
-		if (data[0] == 0x08 && len == 3) {
-			/* sound/brightness buttons pressed */
-			pmu_set_brightness(data[1] >> 3);
-			set_volume(data[2]);
-		} else if (show_pmu_ints
-			   && !(data[0] == PMU_INT_TICK && len == 1)) {
-			int i;
-			printk(KERN_DEBUG "pmu intr");
-			for (i = 0; i < len; ++i)
-				printk(" %.2x", data[i]);
-			printk("\n");
-		}
-	}
-}
-
-static int backlight_level = -1;
-static int backlight_enabled = 0;
-
-#define LEVEL_TO_BRIGHT(lev)	((lev) < 1? 0x7f: 0x4a - ((lev) << 1))
-
-static void 
-pmu_enable_backlight(int on)
-{
-	struct adb_request req;
-
-	if (on) {
-	    /* first call: get current backlight value */
-	    if (backlight_level < 0) {
-		switch(pmu_kind) {
-		    case PMU_68K_V2:
-			pmu_request(&req, NULL, 3, PMU_READ_NVRAM, 0x14, 0xe);
-			while (!req.complete)
-				pmu_poll();
-			printk(KERN_DEBUG "pmu: nvram returned bright: %d\n", (int)req.reply[1]);
-			backlight_level = req.reply[1];
-			break;
-		    default:
-		        backlight_enabled = 0;
-		        return;
-		}
-	    }
-	    pmu_request(&req, NULL, 2, PMU_BACKLIGHT_BRIGHT,
-	    	LEVEL_TO_BRIGHT(backlight_level));
-	    while (!req.complete)
-		pmu_poll();
-	}
-	pmu_request(&req, NULL, 2, PMU_POWER_CTRL,
-	    PMU_POW_BACKLIGHT | (on ? PMU_POW_ON : PMU_POW_OFF));
-	while (!req.complete)
-		pmu_poll();
-	backlight_enabled = on;
-}
-
-static void 
-pmu_set_brightness(int level)
-{
-	int bright;
-
-	backlight_level = level;
-	bright = LEVEL_TO_BRIGHT(level);
-	if (!backlight_enabled)
-		return;
-	if (bright_req_1.complete)
-		pmu_request(&bright_req_1, NULL, 2, PMU_BACKLIGHT_BRIGHT,
-		    bright);
-	if (bright_req_2.complete)
-		pmu_request(&bright_req_2, NULL, 2, PMU_POWER_CTRL,
-		    PMU_POW_BACKLIGHT | (bright < 0x7f ? PMU_POW_ON : PMU_POW_OFF));
-}
-
-void 
-pmu_enable_irled(int on)
-{
-	struct adb_request req;
-
-	pmu_request(&req, NULL, 2, PMU_POWER_CTRL, PMU_POW_IRLED |
-	    (on ? PMU_POW_ON : PMU_POW_OFF));
-	while (!req.complete)
-		pmu_poll();
-}
-
-static void 
-set_volume(int level)
-{
-}
-
-int
-pmu_present(void)
-{
-	return (pmu_kind != PMU_UNKNOWN);
-}
diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h
index e128f609281a..97256f90e6df 100644
--- a/include/uapi/linux/pmu.h
+++ b/include/uapi/linux/pmu.h
@@ -94,7 +94,7 @@ enum {
 	PMU_PADDINGTON_BASED,	/* 1999 PowerBook G3 */
 	PMU_KEYLARGO_BASED,	/* Core99 motherboard (PMU99) */
 	PMU_68K_V1,		/* Unused/deprecated */
-	PMU_68K_V2, 		/* 68K PMU, version 2 */
+	PMU_68K_V2,		/* Unused/deprecated */
 };
 
 /* PMU PMU_POWER_EVENTS commands */
-- 
cgit v1.2.3


From ba113c3aa79a7f941ac162d05a3620bdc985c58d Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:21 -0700
Subject: tcp: add data bytes sent stats

Introduce a new TCP stat to record the number of bytes sent
(RFC4898 tcpEStatsPerfHCDataOctetsOut) and expose it in both tcp_info
(TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS).

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 +++
 include/uapi/linux/tcp.h | 4 +++-
 net/ipv4/tcp.c           | 6 ++++++
 net/ipv4/tcp_output.c    | 1 +
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 58a8d7d71354..d0798dcd2cab 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -181,6 +181,9 @@ struct tcp_sock {
 	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
 				 * total number of data segments sent.
 				 */
+	u64	bytes_sent;	/* RFC4898 tcpEStatsPerfHCDataOctetsOut
+				 * total number of data bytes sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index e3f6ed8a7064..1c70ed287c3b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -235,6 +235,8 @@ struct tcp_info {
 
 	__u32	tcpi_delivered;
 	__u32	tcpi_delivered_ce;
+
+	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -257,7 +259,7 @@ enum {
 	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
 	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
-
+	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 27bbe6a792b7..873cb9968ff5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2594,6 +2594,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	sk->sk_rx_dst = NULL;
 	tcp_saved_syn_free(tp);
 	tp->compressed_ack = 0;
+	tp->bytes_sent = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3201,6 +3202,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_delivery_rate = rate64;
 	info->tcpi_delivered = tp->delivered;
 	info->tcpi_delivered_ce = tp->delivered_ce;
+	info->tcpi_bytes_sent = tp->bytes_sent;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3225,6 +3227,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
 		0;
 }
 
@@ -3272,6 +3275,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
 
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+			  TCP_NLA_PAD);
+
 	return stats;
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 490df62f26d4..861531fe0e97 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1136,6 +1136,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
+		tp->bytes_sent += skb->len - tcp_header_size;
 		tcp_internal_pacing(sk, skb);
 	}
 
-- 
cgit v1.2.3


From fb31c9b9f6c85b1bad569ecedbde78d9e37cd87b Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:22 -0700
Subject: tcp: add data bytes retransmitted stats

Introduce a new TCP stat to record the number of bytes retransmitted
(RFC4898 tcpEStatsPerfOctetsRetrans) and expose it in both tcp_info
(TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS).

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 +++
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 5 +++++
 net/ipv4/tcp_output.c    | 1 +
 4 files changed, 11 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d0798dcd2cab..fb67f9a51b95 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -333,6 +333,9 @@ struct tcp_sock {
 				 * the first SYN. */
 	u32	undo_marker;	/* snd_una upon a new recovery episode. */
 	int	undo_retrans;	/* number of undoable retransmissions. */
+	u64	bytes_retrans;	/* RFC4898 tcpEStatsPerfOctetsRetrans
+				 * Total data bytes retransmitted
+				 */
 	u32	total_retrans;	/* Total retransmits for entire connection */
 
 	u32	urg_seq;	/* Seq of received urgent pointer */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 1c70ed287c3b..c31f5100b744 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -237,6 +237,7 @@ struct tcp_info {
 	__u32	tcpi_delivered_ce;
 
 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
+	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -260,6 +261,7 @@ enum {
 	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
+	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 873cb9968ff5..5ed1be88e922 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_saved_syn_free(tp);
 	tp->compressed_ack = 0;
 	tp->bytes_sent = 0;
+	tp->bytes_retrans = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3203,6 +3204,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_delivered = tp->delivered;
 	info->tcpi_delivered_ce = tp->delivered_ce;
 	info->tcpi_bytes_sent = tp->bytes_sent;
+	info->tcpi_bytes_retrans = tp->bytes_retrans;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3228,6 +3230,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
 		0;
 }
 
@@ -3277,6 +3280,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
 			  TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+			  TCP_NLA_PAD);
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 861531fe0e97..50cabf7656f3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2871,6 +2871,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 	tp->total_retrans += segs;
+	tp->bytes_retrans += skb->len;
 
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
-- 
cgit v1.2.3


From 7e10b6554ff2ce7f86d5d3eec3af5db8db482caa Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:23 -0700
Subject: tcp: add dsack blocks received stats

Introduce a new TCP stat to record the number of DSACK blocks received
(RFC4898 tcpEStatsStackDSACKDups) and expose it in both tcp_info
(TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS).

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 +++
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 4 ++++
 net/ipv4/tcp_input.c     | 1 +
 4 files changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fb67f9a51b95..da6281c549a5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -188,6 +188,9 @@ struct tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	u32	dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
+				 * total number of DSACK blocks received
+				 */
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c31f5100b744..0e1c0aec0153 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -238,6 +238,7 @@ struct tcp_info {
 
 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
+	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -262,6 +263,7 @@ enum {
 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
+	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5ed1be88e922..d6232b598cae 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2596,6 +2596,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->compressed_ack = 0;
 	tp->bytes_sent = 0;
 	tp->bytes_retrans = 0;
+	tp->dsack_dups = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3205,6 +3206,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_delivered_ce = tp->delivered_ce;
 	info->tcpi_bytes_sent = tp->bytes_sent;
 	info->tcpi_bytes_retrans = tp->bytes_retrans;
+	info->tcpi_dsack_dups = tp->dsack_dups;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3231,6 +3233,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
 		0;
 }
 
@@ -3282,6 +3285,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
 			  TCP_NLA_PAD);
+	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d51fa358b2b1..fbc85ff7d71d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -874,6 +874,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
 {
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 	tp->rack.dsack_seen = 1;
+	tp->dsack_dups++;
 }
 
 /* It's reordering when higher sequence was delivered (i.e. sacked) before
-- 
cgit v1.2.3


From 7ec65372ca534217b53fd208500cf7aac223a383 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:24 -0700
Subject: tcp: add stat of data packet reordering events

Introduce a new TCP stat to record the number of reordering events seen
and expose it in both tcp_info (TCP_INFO) and opt_stats
(SOF_TIMESTAMPING_OPT_STATS).
Applications can use this stat to track the frequency of reordering
events, in addition to the existing reordering stat, which tracks the
magnitude of the latest reordering event.

Note: this new stat tracks reordering events triggered by ACKs, which
could often be fewer than the actual number of packets being delivered
out-of-order.

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 4 ++--
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 4 ++++
 net/ipv4/tcp_input.c     | 3 ++-
 net/ipv4/tcp_recovery.c  | 2 +-
 5 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index da6281c549a5..263e37271afd 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -220,8 +220,7 @@ struct tcp_sock {
 #define TCP_RACK_RECOVERY_THRESH 16
 		u8 reo_wnd_persist:5, /* No. of recovery since last adj */
 		   dsack_seen:1, /* Whether DSACK seen after last adj */
-		   advanced:1,	 /* mstamp advanced since last lost marking */
-		   reord:1;	 /* reordering detected */
+		   advanced:1;	 /* mstamp advanced since last lost marking */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
 	u8	compressed_ack;
@@ -267,6 +266,7 @@ struct tcp_sock {
 	u8	ecn_flags;	/* ECN status bits.			*/
 	u8	keepalive_probes; /* num of allowed keep alive probes	*/
 	u32	reordering;	/* Packet reordering metric.		*/
+	u32	reord_seen;	/* number of data packet reordering events */
 	u32	snd_up;		/* Urgent pointer		*/
 
 /*
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 0e1c0aec0153..e02d31986ff9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -239,6 +239,7 @@ struct tcp_info {
 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
 	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
+	__u32	tcpi_reord_seen;     /* reordering events seen */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -264,6 +265,7 @@ enum {
 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
 	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
+	TCP_NLA_REORD_SEEN,	/* reordering events seen */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d6232b598cae..31fa1c080f28 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2597,6 +2597,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->bytes_sent = 0;
 	tp->bytes_retrans = 0;
 	tp->dsack_dups = 0;
+	tp->reord_seen = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3207,6 +3208,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_bytes_sent = tp->bytes_sent;
 	info->tcpi_bytes_retrans = tp->bytes_retrans;
 	info->tcpi_dsack_dups = tp->dsack_dups;
+	info->tcpi_reord_seen = tp->reord_seen;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3234,6 +3236,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
 		0;
 }
 
@@ -3286,6 +3289,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
 			  TCP_NLA_PAD);
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fbc85ff7d71d..3d6156f07a8d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -906,8 +906,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
 				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
 	}
 
-	tp->rack.reord = 1;
 	/* This exciting event is worth to be remembered. 8) */
+	tp->reord_seen++;
 	NET_INC_STATS(sock_net(sk),
 		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
 }
@@ -1871,6 +1871,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 
 	tp->reordering = min_t(u32, tp->packets_out + addend,
 			       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+	tp->reord_seen++;
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
 }
 
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 71593e4400ab..c81aadff769b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!tp->rack.reord) {
+	if (!tp->reord_seen) {
 		/* If reordering has not been observed, be aggressive during
 		 * the recovery or starting the recovery by DUPACK threshold.
 		 */
-- 
cgit v1.2.3


From 83b15832ab91c9a4651decb6dc40075dd979d443 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 25 Jul 2018 12:38:10 -0400
Subject: media: doc-rst: Add packed Bayer raw14 pixel formats

These formats are compressed 14-bit raw bayer formats with four different
pixel orders. They are similar to 10-bit variants. The formats added by
this patch are

	V4L2_PIX_FMT_SBGGR14P
	V4L2_PIX_FMT_SGBRG14P
	V4L2_PIX_FMT_SGRBG14P
	V4L2_PIX_FMT_SRGGB14P

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Todor Tomov <todor.tomov@linaro.org>
Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/pixfmt-rgb.rst      |   1 +
 Documentation/media/uapi/v4l/pixfmt-srggb14p.rst | 127 +++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c             |   4 +
 include/uapi/linux/videodev2.h                   |   5 +
 4 files changed, 137 insertions(+)
 create mode 100644 Documentation/media/uapi/v4l/pixfmt-srggb14p.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/pixfmt-rgb.rst b/Documentation/media/uapi/v4l/pixfmt-rgb.rst
index cf2ef7df9616..1f9a7e3a07c9 100644
--- a/Documentation/media/uapi/v4l/pixfmt-rgb.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-rgb.rst
@@ -19,4 +19,5 @@ RGB Formats
     pixfmt-srggb10-ipu3
     pixfmt-srggb12
     pixfmt-srggb12p
+    pixfmt-srggb14p
     pixfmt-srggb16
diff --git a/Documentation/media/uapi/v4l/pixfmt-srggb14p.rst b/Documentation/media/uapi/v4l/pixfmt-srggb14p.rst
new file mode 100644
index 000000000000..88d20c0e4282
--- /dev/null
+++ b/Documentation/media/uapi/v4l/pixfmt-srggb14p.rst
@@ -0,0 +1,127 @@
+.. -*- coding: utf-8; mode: rst -*-
+
+.. _V4L2-PIX-FMT-SRGGB14P:
+.. _v4l2-pix-fmt-sbggr14p:
+.. _v4l2-pix-fmt-sgbrg14p:
+.. _v4l2-pix-fmt-sgrbg14p:
+
+*******************************************************************************************************************************
+V4L2_PIX_FMT_SRGGB14P ('pREE'), V4L2_PIX_FMT_SGRBG14P ('pgEE'), V4L2_PIX_FMT_SGBRG14P ('pGEE'), V4L2_PIX_FMT_SBGGR14P ('pBEE')
+*******************************************************************************************************************************
+
+*man V4L2_PIX_FMT_SRGGB14P(2)*
+
+V4L2_PIX_FMT_SGRBG14P
+V4L2_PIX_FMT_SGBRG14P
+V4L2_PIX_FMT_SBGGR14P
+14-bit packed Bayer formats
+
+
+Description
+===========
+
+These four pixel formats are packed raw sRGB / Bayer formats with 14
+bits per colour. Every four consecutive samples are packed into seven
+bytes. Each of the first four bytes contains the eight high order bits
+of one pixel, and the three following bytes contain the six least
+significant bits of each pixel, in the same order.
+
+Each n-pixel row contains n/2 green samples and n/2 blue or red samples,
+with alternating green-red and green-blue rows. They are conventionally
+described as GRGR... BGBG..., RGRG... GBGB..., etc. Below is an example
+of one of these formats:
+
+**Byte Order.**
+Each cell is one byte.
+
+
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       2 1 1 1 1 1 1 1
+
+
+    -  .. row 1
+
+       -  start + 0:
+
+       -  B\ :sub:`00high`
+
+       -  G\ :sub:`01high`
+
+       -  B\ :sub:`02high`
+
+       -  G\ :sub:`03high`
+
+       -  G\ :sub:`01low bits 1--0`\ (bits 7--6)
+	  B\ :sub:`00low bits 5--0`\ (bits 5--0)
+
+       -  B\ :sub:`02low bits 3--0`\ (bits 7--4)
+	  G\ :sub:`01low bits 5--2`\ (bits 3--0)
+
+       -  G\ :sub:`03low bits 5--0`\ (bits 7--2)
+	  B\ :sub:`02low bits 5--4`\ (bits 1--0)
+
+    -  .. row 2
+
+       -  start + 7:
+
+       -  G\ :sub:`00high`
+
+       -  R\ :sub:`01high`
+
+       -  G\ :sub:`02high`
+
+       -  R\ :sub:`03high`
+
+       -  R\ :sub:`01low bits 1--0`\ (bits 7--6)
+	  G\ :sub:`00low bits 5--0`\ (bits 5--0)
+
+       -  G\ :sub:`02low bits 3--0`\ (bits 7--4)
+	  R\ :sub:`01low bits 5--2`\ (bits 3--0)
+
+       -  R\ :sub:`03low bits 5--0`\ (bits 7--2)
+	  G\ :sub:`02low bits 5--4`\ (bits 1--0)
+
+    -  .. row 3
+
+       -  start + 14:
+
+       -  B\ :sub:`20high`
+
+       -  G\ :sub:`21high`
+
+       -  B\ :sub:`22high`
+
+       -  G\ :sub:`23high`
+
+       -  G\ :sub:`21low bits 1--0`\ (bits 7--6)
+	  B\ :sub:`20low bits 5--0`\ (bits 5--0)
+
+       -  B\ :sub:`22low bits 3--0`\ (bits 7--4)
+	  G\ :sub:`21low bits 5--2`\ (bits 3--0)
+
+       -  G\ :sub:`23low bits 5--0`\ (bits 7--2)
+	  B\ :sub:`22low bits 5--4`\ (bits 1--0)
+
+    -  .. row 4
+
+       -  start + 21:
+
+       -  G\ :sub:`30high`
+
+       -  R\ :sub:`31high`
+
+       -  G\ :sub:`32high`
+
+       -  R\ :sub:`33high`
+
+       -  R\ :sub:`31low bits 1--0`\ (bits 7--6)
+	  G\ :sub:`30low bits 5--0`\ (bits 5--0)
+
+       -  G\ :sub:`32low bits 3--0`\ (bits 7--4)
+	  R\ :sub:`31low bits 5--2`\ (bits 3--0)
+
+       -  R\ :sub:`33low bits 5--0`\ (bits 7--2)
+	  G\ :sub:`32low bits 5--4`\ (bits 1--0)
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 26d9702069fd..fd8d15f12307 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1259,6 +1259,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG12P:	descr = "12-bit Bayer GBGB/RGRG Packed"; break;
 	case V4L2_PIX_FMT_SGRBG12P:	descr = "12-bit Bayer GRGR/BGBG Packed"; break;
 	case V4L2_PIX_FMT_SRGGB12P:	descr = "12-bit Bayer RGRG/GBGB Packed"; break;
+	case V4L2_PIX_FMT_SBGGR14P:	descr = "14-bit Bayer BGBG/GRGR Packed"; break;
+	case V4L2_PIX_FMT_SGBRG14P:	descr = "14-bit Bayer GBGB/RGRG Packed"; break;
+	case V4L2_PIX_FMT_SGRBG14P:	descr = "14-bit Bayer GRGR/BGBG Packed"; break;
+	case V4L2_PIX_FMT_SRGGB14P:	descr = "14-bit Bayer RGRG/GBGB Packed"; break;
 	case V4L2_PIX_FMT_SBGGR16:	descr = "16-bit Bayer BGBG/GRGR"; break;
 	case V4L2_PIX_FMT_SGBRG16:	descr = "16-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG16:	descr = "16-bit Bayer GRGR/BGBG"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index d8b33095abe0..2c20b6aa5335 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -609,6 +609,11 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_SGBRG12P v4l2_fourcc('p', 'G', 'C', 'C')
 #define V4L2_PIX_FMT_SGRBG12P v4l2_fourcc('p', 'g', 'C', 'C')
 #define V4L2_PIX_FMT_SRGGB12P v4l2_fourcc('p', 'R', 'C', 'C')
+	/* 14bit raw bayer packed, 7 bytes for every 4 pixels */
+#define V4L2_PIX_FMT_SBGGR14P v4l2_fourcc('p', 'B', 'E', 'E')
+#define V4L2_PIX_FMT_SGBRG14P v4l2_fourcc('p', 'G', 'E', 'E')
+#define V4L2_PIX_FMT_SGRBG14P v4l2_fourcc('p', 'g', 'E', 'E')
+#define V4L2_PIX_FMT_SRGGB14P v4l2_fourcc('p', 'R', 'E', 'E')
 #define V4L2_PIX_FMT_SBGGR16 v4l2_fourcc('B', 'Y', 'R', '2') /* 16  BGBG.. GRGR.. */
 #define V4L2_PIX_FMT_SGBRG16 v4l2_fourcc('G', 'B', '1', '6') /* 16  GBGB.. RGRG.. */
 #define V4L2_PIX_FMT_SGRBG16 v4l2_fourcc('G', 'R', '1', '6') /* 16  GRGR.. BGBG.. */
-- 
cgit v1.2.3


From 451af0bf04bd313bbaddd67a8be09d10210780bd Mon Sep 17 00:00:00 2001
From: Todor Tomov <todor.tomov@linaro.org>
Date: Wed, 25 Jul 2018 12:38:11 -0400
Subject: media: v4l: Add new 2X8 10-bit grayscale media bus code

The code will be called MEDIA_BUS_FMT_Y10_2X8_PADHI_LE.
It is similar to MEDIA_BUS_FMT_SBGGR10_2X8_PADHI_LE
but MEDIA_BUS_FMT_Y10_2X8_PADHI_LE describes grayscale
data.

Signed-off-by: Todor Tomov <todor.tomov@linaro.org>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/subdev-formats.rst | 72 +++++++++++++++++++++++++
 include/uapi/linux/media-bus-format.h           |  3 +-
 2 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/subdev-formats.rst b/Documentation/media/uapi/v4l/subdev-formats.rst
index a4739f79d9c3..8e73fcfc6900 100644
--- a/Documentation/media/uapi/v4l/subdev-formats.rst
+++ b/Documentation/media/uapi/v4l/subdev-formats.rst
@@ -4318,6 +4318,78 @@ the following codes.
       - y\ :sub:`2`
       - y\ :sub:`1`
       - y\ :sub:`0`
+    * .. _MEDIA-BUS-FMT-Y10-2X8-PADHI_LE:
+
+      - MEDIA_BUS_FMT_Y10_2X8_PADHI_LE
+      - 0x202c
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - y\ :sub:`7`
+      - y\ :sub:`6`
+      - y\ :sub:`5`
+      - y\ :sub:`4`
+      - y\ :sub:`3`
+      - y\ :sub:`2`
+      - y\ :sub:`1`
+      - y\ :sub:`0`
+    * -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - 0
+      - 0
+      - 0
+      - 0
+      - 0
+      - 0
+      - y\ :sub:`9`
+      - y\ :sub:`8`
     * .. _MEDIA-BUS-FMT-UYVY10-2X10:
 
       - MEDIA_BUS_FMT_UYVY10_2X10
diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
index 9e3511742fdc..d6a5a3bfe6c4 100644
--- a/include/uapi/linux/media-bus-format.h
+++ b/include/uapi/linux/media-bus-format.h
@@ -62,7 +62,7 @@
 #define MEDIA_BUS_FMT_RGB121212_1X36		0x1019
 #define MEDIA_BUS_FMT_RGB161616_1X48		0x101a
 
-/* YUV (including grey) - next is	0x202c */
+/* YUV (including grey) - next is	0x202d */
 #define MEDIA_BUS_FMT_Y8_1X8			0x2001
 #define MEDIA_BUS_FMT_UV8_1X8			0x2015
 #define MEDIA_BUS_FMT_UYVY8_1_5X8		0x2002
@@ -74,6 +74,7 @@
 #define MEDIA_BUS_FMT_YUYV8_2X8			0x2008
 #define MEDIA_BUS_FMT_YVYU8_2X8			0x2009
 #define MEDIA_BUS_FMT_Y10_1X10			0x200a
+#define MEDIA_BUS_FMT_Y10_2X8_PADHI_LE		0x202c
 #define MEDIA_BUS_FMT_UYVY10_2X10		0x2018
 #define MEDIA_BUS_FMT_VYUY10_2X10		0x2019
 #define MEDIA_BUS_FMT_YUYV10_2X10		0x200b
-- 
cgit v1.2.3


From 6e15bec49f366511ec024a556505316222ef4ade Mon Sep 17 00:00:00 2001
From: Todor Tomov <todor.tomov@linaro.org>
Date: Wed, 25 Jul 2018 12:38:12 -0400
Subject: media: v4l: Add new 10-bit packed grayscale format

The new format will be called V4L2_PIX_FMT_Y10P.
It is similar to the V4L2_PIX_FMT_SBGGR10P family formats
but V4L2_PIX_FMT_Y10P is a grayscale format.

Signed-off-by: Todor Tomov <todor.tomov@linaro.org>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hansverk@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 Documentation/media/uapi/v4l/pixfmt-y10p.rst | 33 ++++++++++++++++++++++++++++
 Documentation/media/uapi/v4l/yuv-formats.rst |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c         |  1 +
 include/uapi/linux/videodev2.h               |  1 +
 4 files changed, 36 insertions(+)
 create mode 100644 Documentation/media/uapi/v4l/pixfmt-y10p.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/media/uapi/v4l/pixfmt-y10p.rst b/Documentation/media/uapi/v4l/pixfmt-y10p.rst
new file mode 100644
index 000000000000..13b571306915
--- /dev/null
+++ b/Documentation/media/uapi/v4l/pixfmt-y10p.rst
@@ -0,0 +1,33 @@
+.. -*- coding: utf-8; mode: rst -*-
+
+.. _V4L2-PIX-FMT-Y10P:
+
+******************************
+V4L2_PIX_FMT_Y10P ('Y10P')
+******************************
+
+Grey-scale image as a MIPI RAW10 packed array
+
+
+Description
+===========
+
+This is a packed grey-scale image format with a depth of 10 bits per
+pixel. Every four consecutive pixels are packed into 5 bytes. Each of
+the first 4 bytes contains the 8 high order bits of one pixel, and
+the 5th byte contains the 2 least significant bits of each pixel,
+in the same order.
+
+**Bit-packed representation.**
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 8 8 8 8 64
+
+    * - Y'\ :sub:`00[9:2]`
+      - Y'\ :sub:`01[9:2]`
+      - Y'\ :sub:`02[9:2]`
+      - Y'\ :sub:`03[9:2]`
+      - Y'\ :sub:`03[1:0]`\ (bits 7--6) Y'\ :sub:`02[1:0]`\ (bits 5--4)
+	Y'\ :sub:`01[1:0]`\ (bits 3--2) Y'\ :sub:`00[1:0]`\ (bits 1--0)
diff --git a/Documentation/media/uapi/v4l/yuv-formats.rst b/Documentation/media/uapi/v4l/yuv-formats.rst
index 3334ea445657..9ab0592d08da 100644
--- a/Documentation/media/uapi/v4l/yuv-formats.rst
+++ b/Documentation/media/uapi/v4l/yuv-formats.rst
@@ -29,6 +29,7 @@ to brightness information.
     pixfmt-y10
     pixfmt-y12
     pixfmt-y10b
+    pixfmt-y10p
     pixfmt-y16
     pixfmt-y16-be
     pixfmt-y8i
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index fd8d15f12307..54afc9c7ee6e 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1184,6 +1184,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y16:		descr = "16-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y16_BE:	descr = "16-bit Greyscale BE"; break;
 	case V4L2_PIX_FMT_Y10BPACK:	descr = "10-bit Greyscale (Packed)"; break;
+	case V4L2_PIX_FMT_Y10P:		descr = "10-bit Greyscale (MIPI Packed)"; break;
 	case V4L2_PIX_FMT_Y8I:		descr = "Interleaved 8-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y12I:		descr = "Interleaved 12-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Z16:		descr = "16-bit Depth"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 2c20b6aa5335..5d1a3685bea9 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -522,6 +522,7 @@ struct v4l2_pix_format {
 
 /* Grey bit-packed formats */
 #define V4L2_PIX_FMT_Y10BPACK    v4l2_fourcc('Y', '1', '0', 'B') /* 10  Greyscale bit-packed */
+#define V4L2_PIX_FMT_Y10P    v4l2_fourcc('Y', '1', '0', 'P') /* 10  Greyscale, MIPI RAW10 packed */
 
 /* Palette formats */
 #define V4L2_PIX_FMT_PAL8    v4l2_fourcc('P', 'A', 'L', '8') /*  8  8-bit palette */
-- 
cgit v1.2.3


From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:18 -0700
Subject: bpf: introduce cgroup storage maps

This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps:
a special type of map which implements cgroup storage.

>From the userspace point of view it's almost a generic
hash map with the (cgroup inode id, attachment type) pair
used as a key.

The only difference is that some operations are restricted:
  1) a user can't create new entries,
  2) a user can't remove existing entries.

The lookup from userspace is O(log(n)).

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  38 +++++
 include/linux/bpf.h        |   1 +
 include/linux/bpf_types.h  |   3 +
 include/uapi/linux/bpf.h   |   6 +
 kernel/bpf/Makefile        |   1 +
 kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       |   3 +
 kernel/bpf/verifier.c      |  12 ++
 8 files changed, 440 insertions(+)
 create mode 100644 kernel/bpf/local_storage.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d50c2f0a655a..7d00d58869ed 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -4,19 +4,39 @@
 
 #include <linux/errno.h>
 #include <linux/jump_label.h>
+#include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
 struct sock;
 struct sockaddr;
 struct cgroup;
 struct sk_buff;
+struct bpf_map;
+struct bpf_prog;
 struct bpf_sock_ops_kern;
+struct bpf_cgroup_storage;
 
 #ifdef CONFIG_CGROUP_BPF
 
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_cgroup_storage_map;
+
+struct bpf_storage_buffer {
+	struct rcu_head rcu;
+	char data[0];
+};
+
+struct bpf_cgroup_storage {
+	struct bpf_storage_buffer *buf;
+	struct bpf_cgroup_storage_map *map;
+	struct bpf_cgroup_storage_key key;
+	struct list_head list;
+	struct rb_node node;
+	struct rcu_head rcu;
+};
+
 struct bpf_prog_list {
 	struct list_head node;
 	struct bpf_prog *prog;
@@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+			     struct cgroup *cgroup,
+			     enum bpf_attach_type type);
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
+static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
+					    struct bpf_map *map) { return 0; }
+static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
+					      struct bpf_map *map) {}
+static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
+	struct bpf_prog *prog) { return 0; }
+static inline void bpf_cgroup_storage_free(
+	struct bpf_cgroup_storage *storage) {}
+
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5a4a256473c3..9d1e4727495e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -282,6 +282,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
+	struct bpf_map *cgroup_storage;
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
 	void *security;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..add08be53b6f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
 #ifdef CONFIG_CGROUPS
 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
 #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+#endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..b10118ee5afe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -75,6 +75,11 @@ struct bpf_lpm_trie_key {
 	__u8	data[0];	/* Arbitrary size */
 };
 
+struct bpf_cgroup_storage_key {
+	__u64	cgroup_inode_id;	/* cgroup inode id */
+	__u32	attach_type;		/* program attach type */
+};
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
@@ -120,6 +125,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
 	BPF_MAP_TYPE_SOCKHASH,
+	BPF_MAP_TYPE_CGROUP_STORAGE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..e8906cbad81f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,6 +3,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
new file mode 100644
index 000000000000..f23d3fdeba23
--- /dev/null
+++ b/kernel/bpf/local_storage.c
@@ -0,0 +1,376 @@
+//SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf.h>
+#include <linux/bug.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_CGROUP_BPF
+
+#define LOCAL_STORAGE_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bpf_cgroup_storage_map {
+	struct bpf_map map;
+
+	spinlock_t lock;
+	struct bpf_prog *prog;
+	struct rb_root root;
+	struct list_head list;
+};
+
+static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_cgroup_storage_map, map);
+}
+
+static int bpf_cgroup_storage_key_cmp(
+	const struct bpf_cgroup_storage_key *key1,
+	const struct bpf_cgroup_storage_key *key2)
+{
+	if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+		return -1;
+	else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+		return 1;
+	else if (key1->attach_type < key2->attach_type)
+		return -1;
+	else if (key1->attach_type > key2->attach_type)
+		return 1;
+	return 0;
+}
+
+static struct bpf_cgroup_storage *cgroup_storage_lookup(
+	struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
+	bool locked)
+{
+	struct rb_root *root = &map->root;
+	struct rb_node *node;
+
+	if (!locked)
+		spin_lock_bh(&map->lock);
+
+	node = root->rb_node;
+	while (node) {
+		struct bpf_cgroup_storage *storage;
+
+		storage = container_of(node, struct bpf_cgroup_storage, node);
+
+		switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+		case -1:
+			node = node->rb_left;
+			break;
+		case 1:
+			node = node->rb_right;
+			break;
+		default:
+			if (!locked)
+				spin_unlock_bh(&map->lock);
+			return storage;
+		}
+	}
+
+	if (!locked)
+		spin_unlock_bh(&map->lock);
+
+	return NULL;
+}
+
+static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
+				 struct bpf_cgroup_storage *storage)
+{
+	struct rb_root *root = &map->root;
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	while (*new) {
+		struct bpf_cgroup_storage *this;
+
+		this = container_of(*new, struct bpf_cgroup_storage, node);
+
+		parent = *new;
+		switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+		case -1:
+			new = &((*new)->rb_left);
+			break;
+		case 1:
+			new = &((*new)->rb_right);
+			break;
+		default:
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&storage->node, parent, new);
+	rb_insert_color(&storage->node, root);
+
+	return 0;
+}
+
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage)
+		return NULL;
+
+	return &READ_ONCE(storage->buf)->data[0];
+}
+
+static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+				      void *value, u64 flags)
+{
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	struct bpf_storage_buffer *new;
+
+	if (flags & BPF_NOEXIST)
+		return -EINVAL;
+
+	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
+					key, false);
+	if (!storage)
+		return -ENOENT;
+
+	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+			   map->value_size, __GFP_ZERO | GFP_USER,
+			   map->numa_node);
+	if (!new)
+		return -ENOMEM;
+
+	memcpy(&new->data[0], value, map->value_size);
+
+	new = xchg(&storage->buf, new);
+	kfree_rcu(new, rcu);
+
+	return 0;
+}
+
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+				       void *_next_key)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage_key *next = _next_key;
+	struct bpf_cgroup_storage *storage;
+
+	spin_lock_bh(&map->lock);
+
+	if (list_empty(&map->list))
+		goto enoent;
+
+	if (key) {
+		storage = cgroup_storage_lookup(map, key, true);
+		if (!storage)
+			goto enoent;
+
+		storage = list_next_entry(storage, list);
+		if (!storage)
+			goto enoent;
+	} else {
+		storage = list_first_entry(&map->list,
+					 struct bpf_cgroup_storage, list);
+	}
+
+	spin_unlock_bh(&map->lock);
+	next->attach_type = storage->key.attach_type;
+	next->cgroup_inode_id = storage->key.cgroup_inode_id;
+	return 0;
+
+enoent:
+	spin_unlock_bh(&map->lock);
+	return -ENOENT;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_cgroup_storage_map *map;
+
+	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+		return ERR_PTR(-EINVAL);
+
+	if (attr->value_size > PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK)
+		/* reserved bits should not be used */
+		return ERR_PTR(-EINVAL);
+
+	if (attr->max_entries)
+		/* max_entries is not used and enforced to be 0 */
+		return ERR_PTR(-EINVAL);
+
+	map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
+			   __GFP_ZERO | GFP_USER, numa_node);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
+				  PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* copy mandatory map attributes */
+	bpf_map_init_from_attr(&map->map, attr);
+
+	spin_lock_init(&map->lock);
+	map->root = RB_ROOT;
+	INIT_LIST_HEAD(&map->list);
+
+	return &map->map;
+}
+
+static void cgroup_storage_map_free(struct bpf_map *_map)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+	WARN_ON(!RB_EMPTY_ROOT(&map->root));
+	WARN_ON(!list_empty(&map->list));
+
+	kfree(map);
+}
+
+static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+const struct bpf_map_ops cgroup_storage_map_ops = {
+	.map_alloc = cgroup_storage_map_alloc,
+	.map_free = cgroup_storage_map_free,
+	.map_get_next_key = cgroup_storage_get_next_key,
+	.map_lookup_elem = cgroup_storage_lookup_elem,
+	.map_update_elem = cgroup_storage_update_elem,
+	.map_delete_elem = cgroup_storage_delete_elem,
+};
+
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	int ret = -EBUSY;
+
+	spin_lock_bh(&map->lock);
+
+	if (map->prog && map->prog != prog)
+		goto unlock;
+	if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+		goto unlock;
+
+	map->prog = prog;
+	prog->aux->cgroup_storage = _map;
+	ret = 0;
+unlock:
+	spin_unlock_bh(&map->lock);
+
+	return ret;
+}
+
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+	spin_lock_bh(&map->lock);
+	if (map->prog == prog) {
+		WARN_ON(prog->aux->cgroup_storage != _map);
+		map->prog = NULL;
+		prog->aux->cgroup_storage = NULL;
+	}
+	spin_unlock_bh(&map->lock);
+}
+
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+{
+	struct bpf_cgroup_storage *storage;
+	struct bpf_map *map;
+	u32 pages;
+
+	map = prog->aux->cgroup_storage;
+	if (!map)
+		return NULL;
+
+	pages = round_up(sizeof(struct bpf_cgroup_storage) +
+			 sizeof(struct bpf_storage_buffer) +
+			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	if (bpf_map_charge_memlock(map, pages))
+		return ERR_PTR(-EPERM);
+
+	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
+			       __GFP_ZERO | GFP_USER, map->numa_node);
+	if (!storage) {
+		bpf_map_uncharge_memlock(map, pages);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+				    map->value_size, __GFP_ZERO | GFP_USER,
+				    map->numa_node);
+	if (!storage->buf) {
+		bpf_map_uncharge_memlock(map, pages);
+		kfree(storage);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	storage->map = (struct bpf_cgroup_storage_map *)map;
+
+	return storage;
+}
+
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
+{
+	u32 pages;
+	struct bpf_map *map;
+
+	if (!storage)
+		return;
+
+	map = &storage->map->map;
+	pages = round_up(sizeof(struct bpf_cgroup_storage) +
+			 sizeof(struct bpf_storage_buffer) +
+			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	bpf_map_uncharge_memlock(map, pages);
+
+	kfree_rcu(storage->buf, rcu);
+	kfree_rcu(storage, rcu);
+}
+
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+			     struct cgroup *cgroup,
+			     enum bpf_attach_type type)
+{
+	struct bpf_cgroup_storage_map *map;
+
+	if (!storage)
+		return;
+
+	storage->key.attach_type = type;
+	storage->key.cgroup_inode_id = cgroup->kn->id.id;
+
+	map = storage->map;
+
+	spin_lock_bh(&map->lock);
+	WARN_ON(cgroup_storage_insert(map, storage));
+	list_add(&storage->list, &map->list);
+	spin_unlock_bh(&map->lock);
+}
+
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
+{
+	struct bpf_cgroup_storage_map *map;
+	struct rb_root *root;
+
+	if (!storage)
+		return;
+
+	map = storage->map;
+
+	spin_lock_bh(&map->lock);
+	root = &map->root;
+	rb_erase(&storage->node, root);
+
+	list_del(&storage->list);
+	spin_unlock_bh(&map->lock);
+}
+
+#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7958252a4d29..5af4e9e2722d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 {
 	int i;
 
+	if (aux->cgroup_storage)
+		bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+
 	for (i = 0; i < aux->used_map_cnt; i++)
 		bpf_map_put(aux->used_maps[i]);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e948303a0ea8..7e75434a9e54 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			}
 			env->used_maps[env->used_map_cnt++] = map;
 
+			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+			    bpf_cgroup_storage_assign(env->prog, map)) {
+				verbose(env,
+					"only one cgroup storage is allowed\n");
+				fdput(f);
+				return -EBUSY;
+			}
+
 			fdput(f);
 next_insn:
 			insn++;
@@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env)
 {
 	int i;
 
+	if (env->prog->aux->cgroup_storage)
+		bpf_cgroup_storage_release(env->prog,
+					   env->prog->aux->cgroup_storage);
+
 	for (i = 0; i < env->used_map_cnt; i++)
 		bpf_map_put(env->used_maps[i]);
 }
-- 
cgit v1.2.3


From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:24 -0700
Subject: bpf: introduce the bpf_get_local_storage() helper function

The bpf_get_local_storage() helper function is used
to get a pointer to the bpf local storage from a bpf program.

It takes a pointer to a storage map and flags as arguments.
Right now it accepts only cgroup storage maps, and the flags
argument has to be 0. In the future it can be extended to support
other types of local storage, e.g. thread-local storage.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h | 21 ++++++++++++++++++++-
 kernel/bpf/cgroup.c      |  2 ++
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 20 ++++++++++++++++++++
 kernel/bpf/verifier.c    | 18 ++++++++++++++++++
 net/core/filter.c        | 23 ++++++++++++++++++++++-
 7 files changed, 85 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ca4ac2a39def..cd8790d2c6ed 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
 
+extern const struct bpf_func_proto bpf_get_local_storage_proto;
+
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b10118ee5afe..dd5758dc35d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2095,6 +2095,24 @@ union bpf_attr {
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
+ *
+ * void* get_local_storage(void *map, u64 flags)
+ *	Description
+ *		Get the pointer to the local storage area.
+ *		The type and the size of the local storage is defined
+ *		by the *map* argument.
+ *		The *flags* meaning is specific for each map type,
+ *		and has to be 0 for cgroup local storage.
+ *
+ *		Depending on the bpf program type, a local storage area
+ *		can be shared between multiple instances of the bpf program,
+ *		running simultaneously.
+ *
+ *		A user should care about the synchronization by himself.
+ *		For example, by using the BPF_STX_XADD instruction to alter
+ *		the shared data.
+ *	Return
+ *		Pointer to the local storage area.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2177,7 +2195,8 @@ union bpf_attr {
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
 	FN(skb_cgroup_id),		\
-	FN(get_current_cgroup_id),
+	FN(get_current_cgroup_id),	\
+	FN(get_local_storage),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ddfa6cc13e57..0a4fe5a7dc91 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_map_delete_elem_proto;
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_trace_printk:
 		if (capable(CAP_SYS_ADMIN))
 			return bpf_get_trace_printk_proto();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9abcf25ebf9f..4d09e610777f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_local_storage_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 73065e2d23c2..1991466b8327 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
+
+DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+	/* map and flags arguments are not used now,
+	 * but provide an ability to extend the API
+	 * for other types of local storages.
+	 * verifier checks that their values are correct.
+	 */
+	return (unsigned long) this_cpu_read(bpf_cgroup_storage);
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+	.func		= bpf_get_local_storage,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_MAP_VALUE,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+};
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1ede16c8bb40..587468a9c37d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_current_task_under_cgroup)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_CGROUP_STORAGE:
+		if (func_id != BPF_FUNC_get_local_storage)
+			goto error;
+		break;
 	/* devmap returns a pointer to a live net_device ifindex that we cannot
 	 * allow to be modified from bpf side. So do not allow lookup elements
 	 * for now.
@@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
 			goto error;
 		break;
+	case BPF_FUNC_get_local_storage:
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	}
 
 	regs = cur_regs(env);
+
+	/* check that flags argument in get_local_storage(map, flags) is 0,
+	 * this is required because get_local_storage() can't return an error.
+	 */
+	if (func_id == BPF_FUNC_get_local_storage &&
+	    !register_is_null(&regs[BPF_REG_2])) {
+		verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+		return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/net/core/filter.c b/net/core/filter.c
index 9bb9a4488e25..9f73aae2f089 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4820,6 +4820,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	 */
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4844,6 +4846,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		}
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_addr_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4866,6 +4870,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
+	default:
+		return sk_filter_func_proto(func_id, prog);
+	}
+}
+
 static const struct bpf_func_proto *
 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4988,6 +5003,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sock_hash_update_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_ops_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5007,6 +5024,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_cork_bytes_proto;
 	case BPF_FUNC_msg_pull_data:
 		return &bpf_msg_pull_data_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5034,6 +5053,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_map_proto;
 	case BPF_FUNC_sk_redirect_hash:
 		return &bpf_sk_redirect_hash_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6838,7 +6859,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
 };
 
 const struct bpf_verifier_ops cg_skb_verifier_ops = {
-	.get_func_proto		= sk_filter_func_proto,
+	.get_func_proto		= cg_skb_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
-- 
cgit v1.2.3


From 7cca1ed0bb248b8d5768d17f5afe297a832d66c0 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 31 Jul 2018 20:25:00 +0200
Subject: netfilter: nf_osf: move nf_osf_fingers to non-uapi header file

All warnings (new ones prefixed by >>):

>> ./usr/include/linux/netfilter/nf_osf.h:73: userspace cannot reference function or variable defined in the kernel

Fixes: f9324952088f ("netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h      | 2 ++
 include/uapi/linux/netfilter/nf_osf.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
index aee460fcbd31..3e455d6f94d5 100644
--- a/include/linux/netfilter/nf_osf.h
+++ b/include/linux/netfilter/nf_osf.h
@@ -25,6 +25,8 @@ enum osf_fmatch_states {
 	FMATCH_OPT_WRONG,
 };
 
+extern struct list_head nf_osf_fingers[2];
+
 struct nf_osf_finger {
 	struct rcu_head			rcu_head;
 	struct list_head		finger_entry;
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index cc2487ff74f6..3b93fbb9fc24 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -70,8 +70,6 @@ struct nf_osf_nlmsg {
 	struct tcphdr			tcp;
 };
 
-extern struct list_head nf_osf_fingers[2];
-
 /* Defines for IANA option kinds */
 enum iana_options {
 	OSFOPT_EOL = 0,		/* End of options */
-- 
cgit v1.2.3


From ddba40be59c9be4059288464f8e6f38fbba27495 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 31 Jul 2018 20:25:01 +0200
Subject: netfilter: nfnetlink_osf: rename nf_osf header file to nfnetlink_osf

The first client of the nf_osf.h userspace header is nft_osf, coming in
this batch. Rename the header to nfnetlink_osf.h now, as there are no
userspace clients for it yet; this makes it consistent with the other
nfnetlink subsystems.

Suggested-by: Jan Engelhardt <jengelh@inai.de>
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h             |  44 -----------
 include/linux/netfilter/nfnetlink_osf.h      |  44 +++++++++++
 include/uapi/linux/netfilter/nf_osf.h        | 106 ---------------------------
 include/uapi/linux/netfilter/nfnetlink_osf.h | 106 +++++++++++++++++++++++++++
 include/uapi/linux/netfilter/xt_osf.h        |   2 +-
 net/netfilter/nfnetlink_osf.c                |   2 +-
 net/netfilter/nft_osf.c                      |   2 +-
 7 files changed, 153 insertions(+), 153 deletions(-)
 delete mode 100644 include/linux/netfilter/nf_osf.h
 create mode 100644 include/linux/netfilter/nfnetlink_osf.h
 delete mode 100644 include/uapi/linux/netfilter/nf_osf.h
 create mode 100644 include/uapi/linux/netfilter/nfnetlink_osf.h

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
deleted file mode 100644
index 3e455d6f94d5..000000000000
--- a/include/linux/netfilter/nf_osf.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NFOSF_H
-#define _NFOSF_H
-
-#include <uapi/linux/netfilter/nf_osf.h>
-
-/* Initial window size option state machine: multiple of mss, mtu or
- * plain numeric value. Can also be made as plain numeric value which
- * is not a multiple of specified value.
- */
-enum nf_osf_window_size_options {
-	OSF_WSS_PLAIN   = 0,
-	OSF_WSS_MSS,
-	OSF_WSS_MTU,
-	OSF_WSS_MODULO,
-	OSF_WSS_MAX,
-};
-
-enum osf_fmatch_states {
-	/* Packet does not match the fingerprint */
-	FMATCH_WRONG = 0,
-	/* Packet matches the fingerprint */
-	FMATCH_OK,
-	/* Options do not match the fingerprint, but header does */
-	FMATCH_OPT_WRONG,
-};
-
-extern struct list_head nf_osf_fingers[2];
-
-struct nf_osf_finger {
-	struct rcu_head			rcu_head;
-	struct list_head		finger_entry;
-	struct nf_osf_user_finger	finger;
-};
-
-bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
-		  int hooknum, struct net_device *in, struct net_device *out,
-		  const struct nf_osf_info *info, struct net *net,
-		  const struct list_head *nf_osf_fingers);
-
-const char *nf_osf_find(const struct sk_buff *skb,
-                        const struct list_head *nf_osf_fingers);
-
-#endif /* _NFOSF_H */
diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h
new file mode 100644
index 000000000000..a7311bc03d3a
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink_osf.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NFOSF_H
+#define _NFOSF_H
+
+#include <uapi/linux/netfilter/nfnetlink_osf.h>
+
+/* Initial window size option state machine: multiple of mss, mtu or
+ * plain numeric value. Can also be made as plain numeric value which
+ * is not a multiple of specified value.
+ */
+enum nf_osf_window_size_options {
+	OSF_WSS_PLAIN   = 0,
+	OSF_WSS_MSS,
+	OSF_WSS_MTU,
+	OSF_WSS_MODULO,
+	OSF_WSS_MAX,
+};
+
+enum osf_fmatch_states {
+	/* Packet does not match the fingerprint */
+	FMATCH_WRONG = 0,
+	/* Packet matches the fingerprint */
+	FMATCH_OK,
+	/* Options do not match the fingerprint, but header does */
+	FMATCH_OPT_WRONG,
+};
+
+extern struct list_head nf_osf_fingers[2];
+
+struct nf_osf_finger {
+	struct rcu_head			rcu_head;
+	struct list_head		finger_entry;
+	struct nf_osf_user_finger	finger;
+};
+
+bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+		  int hooknum, struct net_device *in, struct net_device *out,
+		  const struct nf_osf_info *info, struct net *net,
+		  const struct list_head *nf_osf_fingers);
+
+const char *nf_osf_find(const struct sk_buff *skb,
+                        const struct list_head *nf_osf_fingers);
+
+#endif /* _NFOSF_H */
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
deleted file mode 100644
index 3b93fbb9fc24..000000000000
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef _NF_OSF_H
-#define _NF_OSF_H
-
-#include <linux/types.h>
-
-#define MAXGENRELEN	32
-
-#define NF_OSF_GENRE	(1 << 0)
-#define NF_OSF_TTL	(1 << 1)
-#define NF_OSF_LOG	(1 << 2)
-#define NF_OSF_INVERT	(1 << 3)
-
-#define NF_OSF_LOGLEVEL_ALL		0	/* log all matched fingerprints */
-#define NF_OSF_LOGLEVEL_FIRST		1	/* log only the first matced fingerprint */
-#define NF_OSF_LOGLEVEL_ALL_KNOWN	2	/* do not log unknown packets */
-
-#define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
-
-/* Check if ip TTL is less than fingerprint one */
-#define NF_OSF_TTL_LESS			1
-
-/* Do not compare ip and fingerprint TTL at all */
-#define NF_OSF_TTL_NOCHECK		2
-
-#define NF_OSF_FLAGMASK		(NF_OSF_GENRE | NF_OSF_TTL | \
-				 NF_OSF_LOG | NF_OSF_INVERT)
-/* Wildcard MSS (kind of).
- * It is used to implement a state machine for the different wildcard values
- * of the MSS and window sizes.
- */
-struct nf_osf_wc {
-	__u32	wc;
-	__u32	val;
-};
-
-/* This struct represents IANA options
- * http://www.iana.org/assignments/tcp-parameters
- */
-struct nf_osf_opt {
-	__u16			kind, length;
-	struct nf_osf_wc	wc;
-};
-
-struct nf_osf_info {
-	char	genre[MAXGENRELEN];
-	__u32	len;
-	__u32	flags;
-	__u32	loglevel;
-	__u32	ttl;
-};
-
-struct nf_osf_user_finger {
-	struct nf_osf_wc	wss;
-
-	__u8	ttl, df;
-	__u16	ss, mss;
-	__u16	opt_num;
-
-	char	genre[MAXGENRELEN];
-	char	version[MAXGENRELEN];
-	char	subtype[MAXGENRELEN];
-
-	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
-	struct nf_osf_opt	opt[MAX_IPOPTLEN];
-};
-
-struct nf_osf_nlmsg {
-	struct nf_osf_user_finger	f;
-	struct iphdr			ip;
-	struct tcphdr			tcp;
-};
-
-/* Defines for IANA option kinds */
-enum iana_options {
-	OSFOPT_EOL = 0,		/* End of options */
-	OSFOPT_NOP,		/* NOP */
-	OSFOPT_MSS,		/* Maximum segment size */
-	OSFOPT_WSO,		/* Window scale option */
-	OSFOPT_SACKP,		/* SACK permitted */
-	OSFOPT_SACK,		/* SACK */
-	OSFOPT_ECHO,
-	OSFOPT_ECHOREPLY,
-	OSFOPT_TS,		/* Timestamp option */
-	OSFOPT_POCP,		/* Partial Order Connection Permitted */
-	OSFOPT_POSP,		/* Partial Order Service Profile */
-
-	/* Others are not used in the current OSF */
-	OSFOPT_EMPTY = 255,
-};
-
-enum nf_osf_attr_type {
-	OSF_ATTR_UNSPEC,
-	OSF_ATTR_FINGER,
-	OSF_ATTR_MAX,
-};
-
-/*
- * Add/remove fingerprint from the kernel.
- */
-enum nf_osf_msg_types {
-	OSF_MSG_ADD,
-	OSF_MSG_REMOVE,
-	OSF_MSG_MAX,
-};
-
-#endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h
new file mode 100644
index 000000000000..3b93fbb9fc24
--- /dev/null
+++ b/include/uapi/linux/netfilter/nfnetlink_osf.h
@@ -0,0 +1,106 @@
+#ifndef _NF_OSF_H
+#define _NF_OSF_H
+
+#include <linux/types.h>
+
+#define MAXGENRELEN	32
+
+#define NF_OSF_GENRE	(1 << 0)
+#define NF_OSF_TTL	(1 << 1)
+#define NF_OSF_LOG	(1 << 2)
+#define NF_OSF_INVERT	(1 << 3)
+
+#define NF_OSF_LOGLEVEL_ALL		0	/* log all matched fingerprints */
+#define NF_OSF_LOGLEVEL_FIRST		1	/* log only the first matced fingerprint */
+#define NF_OSF_LOGLEVEL_ALL_KNOWN	2	/* do not log unknown packets */
+
+#define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
+
+/* Check if ip TTL is less than fingerprint one */
+#define NF_OSF_TTL_LESS			1
+
+/* Do not compare ip and fingerprint TTL at all */
+#define NF_OSF_TTL_NOCHECK		2
+
+#define NF_OSF_FLAGMASK		(NF_OSF_GENRE | NF_OSF_TTL | \
+				 NF_OSF_LOG | NF_OSF_INVERT)
+/* Wildcard MSS (kind of).
+ * It is used to implement a state machine for the different wildcard values
+ * of the MSS and window sizes.
+ */
+struct nf_osf_wc {
+	__u32	wc;
+	__u32	val;
+};
+
+/* This struct represents IANA options
+ * http://www.iana.org/assignments/tcp-parameters
+ */
+struct nf_osf_opt {
+	__u16			kind, length;
+	struct nf_osf_wc	wc;
+};
+
+struct nf_osf_info {
+	char	genre[MAXGENRELEN];
+	__u32	len;
+	__u32	flags;
+	__u32	loglevel;
+	__u32	ttl;
+};
+
+struct nf_osf_user_finger {
+	struct nf_osf_wc	wss;
+
+	__u8	ttl, df;
+	__u16	ss, mss;
+	__u16	opt_num;
+
+	char	genre[MAXGENRELEN];
+	char	version[MAXGENRELEN];
+	char	subtype[MAXGENRELEN];
+
+	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
+	struct nf_osf_opt	opt[MAX_IPOPTLEN];
+};
+
+struct nf_osf_nlmsg {
+	struct nf_osf_user_finger	f;
+	struct iphdr			ip;
+	struct tcphdr			tcp;
+};
+
+/* Defines for IANA option kinds */
+enum iana_options {
+	OSFOPT_EOL = 0,		/* End of options */
+	OSFOPT_NOP,		/* NOP */
+	OSFOPT_MSS,		/* Maximum segment size */
+	OSFOPT_WSO,		/* Window scale option */
+	OSFOPT_SACKP,		/* SACK permitted */
+	OSFOPT_SACK,		/* SACK */
+	OSFOPT_ECHO,
+	OSFOPT_ECHOREPLY,
+	OSFOPT_TS,		/* Timestamp option */
+	OSFOPT_POCP,		/* Partial Order Connection Permitted */
+	OSFOPT_POSP,		/* Partial Order Service Profile */
+
+	/* Others are not used in the current OSF */
+	OSFOPT_EMPTY = 255,
+};
+
+enum nf_osf_attr_type {
+	OSF_ATTR_UNSPEC,
+	OSF_ATTR_FINGER,
+	OSF_ATTR_MAX,
+};
+
+/*
+ * Add/remove fingerprint from the kernel.
+ */
+enum nf_osf_msg_types {
+	OSF_MSG_ADD,
+	OSF_MSG_REMOVE,
+	OSF_MSG_MAX,
+};
+
+#endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index a90e90c27cef..c56c59605c2b 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -23,7 +23,7 @@
 #include <linux/types.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
-#include <linux/netfilter/nf_osf.h>
+#include <linux/netfilter/nfnetlink_osf.h>
 
 #define XT_OSF_GENRE		NF_OSF_GENRE
 #define XT_OSF_INVERT		NF_OSF_INVERT
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index ba0fa11869ce..f9dba62c450f 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -18,7 +18,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/x_tables.h>
 #include <net/netfilter/nf_log.h>
-#include <linux/netfilter/nf_osf.h>
+#include <linux/netfilter/nfnetlink_osf.h>
 
 /*
  * Indexed by dont-fragment bit.
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index bdacc4cffba4..9b2f3de7be4f 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -2,7 +2,7 @@
 #include <net/tcp.h>
 
 #include <net/netfilter/nf_tables.h>
-#include <linux/netfilter/nf_osf.h>
+#include <linux/netfilter/nfnetlink_osf.h>
 
 #define OSF_GENRE_SIZE		32
 
-- 
cgit v1.2.3


From e9697e2effad50c0081b3c72002d3975f8ab4347 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 3 Aug 2018 12:38:39 +0200
Subject: l2tp: ignore L2TP_ATTR_MTU

This attribute's handling is broken. It can only be used when creating
Ethernet pseudo-wires, in which case its value can be used as the
initial MTU for the l2tpeth device.
However, when handling update requests, L2TP_ATTR_MTU only modifies
session->mtu. This value is never propagated to the l2tpeth device.
Dump requests also return the value of session->mtu, which is not
synchronised anymore with the device MTU.

The same problem occurs if the device MTU is properly updated using the
generic IFLA_MTU attribute. In this case, session->mtu is not updated,
and L2TP_ATTR_MTU will report an invalid value again when dumping the
session.

It does not seem worthwhile to complicate l2tp_eth.c to synchronise
session->mtu with the device MTU. Even the ip-l2tp manpage advises
using 'ip link' to initialise the MTU of l2tpeth devices (iproute2 does
not handle L2TP_ATTR_MTU at all anyway). So let's just ignore it
entirely.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  2 +-
 net/l2tp/l2tp_core.c      |  1 -
 net/l2tp/l2tp_core.h      |  2 --
 net/l2tp/l2tp_debugfs.c   |  3 +--
 net/l2tp/l2tp_eth.c       | 17 +++++++----------
 net/l2tp/l2tp_netlink.c   |  9 +--------
 6 files changed, 10 insertions(+), 24 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 8bb8c7cfabe5..61158f5a1a5b 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -119,7 +119,7 @@ enum {
 	L2TP_ATTR_IP_DADDR,		/* u32 */
 	L2TP_ATTR_UDP_SPORT,		/* u16 */
 	L2TP_ATTR_UDP_DPORT,		/* u16 */
-	L2TP_ATTR_MTU,			/* u16 */
+	L2TP_ATTR_MTU,			/* u16 (not used) */
 	L2TP_ATTR_MRU,			/* u16 (not used) */
 	L2TP_ATTR_STATS,		/* nested */
 	L2TP_ATTR_IP6_SADDR,		/* struct in6_addr */
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index c61a467fd9b8..ac6a00bcec71 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1674,7 +1674,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 		if (cfg) {
 			session->pwtype = cfg->pw_type;
 			session->debug = cfg->debug;
-			session->mtu = cfg->mtu;
 			session->send_seq = cfg->send_seq;
 			session->recv_seq = cfg->recv_seq;
 			session->lns_mode = cfg->lns_mode;
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 1ca39629031b..5804065dfbfb 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -64,7 +64,6 @@ struct l2tp_session_cfg {
 	int			peer_cookie_len; /* 0, 4 or 8 bytes */
 	int			reorder_timeout; /* configured reorder timeout
 						  * (in jiffies) */
-	int			mtu;
 	char			*ifname;
 };
 
@@ -108,7 +107,6 @@ struct l2tp_session {
 	int			reorder_timeout; /* configured reorder timeout
 						  * (in jiffies) */
 	int			reorder_skip;	/* set if skip to next nr */
-	int			mtu;
 	enum l2tp_pwtype	pwtype;
 	struct l2tp_stats	stats;
 	struct hlist_node	global_hlist;	/* Global hash list node */
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index aee271741f5b..9821a1458555 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -191,8 +191,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
 	if (session->send_seq || session->recv_seq)
 		seq_printf(m, "   nr %hu, ns %hu\n", session->nr, session->ns);
 	seq_printf(m, "   refcnt %d\n", refcount_read(&session->ref_count));
-	seq_printf(m, "   config %d/0/%c/%c/-/%s %08x %u\n",
-		   session->mtu,
+	seq_printf(m, "   config 0/0/%c/%c/-/%s %08x %u\n",
 		   session->recv_seq ? 'R' : '-',
 		   session->send_seq ? 'S' : '-',
 		   session->lns_mode ? "LNS" : "LAC",
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index cfca5e63ae31..3728986ec885 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -234,14 +234,11 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
 		overhead += sizeof(struct udphdr);
 		dev->needed_headroom += sizeof(struct udphdr);
 	}
-	if (session->mtu != 0) {
-		dev->mtu = session->mtu;
-		dev->needed_headroom += session->hdr_len;
-		return;
-	}
+
 	lock_sock(tunnel->sock);
 	l3_overhead = kernel_sock_ip_overhead(tunnel->sock);
 	release_sock(tunnel->sock);
+
 	if (l3_overhead == 0) {
 		/* L3 Overhead couldn't be identified, this could be
 		 * because tunnel->sock was NULL or the socket's
@@ -255,12 +252,12 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
 	 */
 	overhead += session->hdr_len + ETH_HLEN + l3_overhead;
 
-	/* If PMTU discovery was enabled, use discovered MTU on L2TP device */
-	mtu = l2tp_tunnel_dst_mtu(tunnel);
-	if (mtu)
+	mtu = l2tp_tunnel_dst_mtu(tunnel) - overhead;
+	if (mtu < dev->min_mtu || mtu > dev->max_mtu)
+		dev->mtu = ETH_DATA_LEN - overhead;
+	else
 		dev->mtu = mtu;
-	session->mtu = dev->mtu - overhead;
-	dev->mtu = session->mtu;
+
 	dev->needed_headroom += session->hdr_len;
 }
 
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index a7c409215336..2e1e92651545 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -608,9 +608,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
 	if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
 		cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
 
-	if (info->attrs[L2TP_ATTR_MTU])
-		cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
-
 #ifdef CONFIG_MODULES
 	if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
 		genl_unlock();
@@ -698,9 +695,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
 	if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
 		session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
 
-	if (info->attrs[L2TP_ATTR_MTU])
-		session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
-
 	ret = l2tp_session_notify(&l2tp_nl_family, info,
 				  session, L2TP_CMD_SESSION_MODIFY);
 
@@ -730,8 +724,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 	    nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID,
 			session->peer_session_id) ||
 	    nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
-	    nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) ||
-	    nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu))
+	    nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
 		goto nla_put_failure;
 
 	if ((session->ifname[0] &&
-- 
cgit v1.2.3


From af308b94a2a4a5a27bec9028354c4df444a7c8ba Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 2 Aug 2018 20:51:39 +0200
Subject: netfilter: nf_tables: add tunnel support

This patch implements the tunnel object type that can be used to
configure tunnels via metadata template through the existing lightweight
API from the ingress path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  69 ++++-
 net/core/dst.c                           |   1 +
 net/netfilter/Kconfig                    |   6 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_tunnel.c               | 458 +++++++++++++++++++++++++++++++
 5 files changed, 534 insertions(+), 1 deletion(-)
 create mode 100644 net/netfilter/nft_tunnel.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index f112ea52dc1a..3ee1198eeac1 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1416,7 +1416,8 @@ enum nft_ct_helper_attributes {
 #define NFT_OBJECT_CT_HELPER	3
 #define NFT_OBJECT_LIMIT	4
 #define NFT_OBJECT_CONNLIMIT	5
-#define __NFT_OBJECT_MAX	6
+#define NFT_OBJECT_TUNNEL	6
+#define __NFT_OBJECT_MAX	7
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
@@ -1580,4 +1581,70 @@ enum nft_ng_types {
 };
 #define NFT_NG_MAX	(__NFT_NG_MAX - 1)
 
+enum nft_tunnel_key_ip_attributes {
+	NFTA_TUNNEL_KEY_IP_UNSPEC,
+	NFTA_TUNNEL_KEY_IP_SRC,
+	NFTA_TUNNEL_KEY_IP_DST,
+	__NFTA_TUNNEL_KEY_IP_MAX
+};
+#define NFTA_TUNNEL_KEY_IP_MAX	(__NFTA_TUNNEL_KEY_IP_MAX - 1)
+
+enum nft_tunnel_ip6_attributes {
+	NFTA_TUNNEL_KEY_IP6_UNSPEC,
+	NFTA_TUNNEL_KEY_IP6_SRC,
+	NFTA_TUNNEL_KEY_IP6_DST,
+	NFTA_TUNNEL_KEY_IP6_FLOWLABEL,
+	__NFTA_TUNNEL_KEY_IP6_MAX
+};
+#define NFTA_TUNNEL_KEY_IP6_MAX	(__NFTA_TUNNEL_KEY_IP6_MAX - 1)
+
+enum nft_tunnel_opts_attributes {
+	NFTA_TUNNEL_KEY_OPTS_UNSPEC,
+	NFTA_TUNNEL_KEY_OPTS_VXLAN,
+	NFTA_TUNNEL_KEY_OPTS_ERSPAN,
+	__NFTA_TUNNEL_KEY_OPTS_MAX
+};
+#define NFTA_TUNNEL_KEY_OPTS_MAX	(__NFTA_TUNNEL_KEY_OPTS_MAX - 1)
+
+enum nft_tunnel_opts_vxlan_attributes {
+	NFTA_TUNNEL_KEY_VXLAN_UNSPEC,
+	NFTA_TUNNEL_KEY_VXLAN_GBP,
+	__NFTA_TUNNEL_KEY_VXLAN_MAX
+};
+#define NFTA_TUNNEL_KEY_VXLAN_MAX	(__NFTA_TUNNEL_KEY_VXLAN_MAX - 1)
+
+enum nft_tunnel_opts_erspan_attributes {
+	NFTA_TUNNEL_KEY_ERSPAN_UNSPEC,
+	NFTA_TUNNEL_KEY_ERSPAN_VERSION,
+	NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
+	NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
+	NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
+	__NFTA_TUNNEL_KEY_ERSPAN_MAX
+};
+#define NFTA_TUNNEL_KEY_ERSPAN_MAX	(__NFTA_TUNNEL_KEY_ERSPAN_MAX - 1)
+
+enum nft_tunnel_flags {
+	NFT_TUNNEL_F_ZERO_CSUM_TX	= (1 << 0),
+	NFT_TUNNEL_F_DONT_FRAGMENT	= (1 << 1),
+	NFT_TUNNEL_F_SEQ_NUMBER		= (1 << 2),
+};
+#define NFT_TUNNEL_F_MASK	(NFT_TUNNEL_F_ZERO_CSUM_TX | \
+				 NFT_TUNNEL_F_DONT_FRAGMENT | \
+				 NFT_TUNNEL_F_SEQ_NUMBER)
+
+enum nft_tunnel_key_attributes {
+	NFTA_TUNNEL_KEY_UNSPEC,
+	NFTA_TUNNEL_KEY_ID,
+	NFTA_TUNNEL_KEY_IP,
+	NFTA_TUNNEL_KEY_IP6,
+	NFTA_TUNNEL_KEY_FLAGS,
+	NFTA_TUNNEL_KEY_TOS,
+	NFTA_TUNNEL_KEY_TTL,
+	NFTA_TUNNEL_KEY_SPORT,
+	NFTA_TUNNEL_KEY_DPORT,
+	NFTA_TUNNEL_KEY_OPTS,
+	__NFTA_TUNNEL_KEY_MAX
+};
+#define NFTA_TUNNEL_KEY_MAX	(__NFTA_TUNNEL_KEY_MAX - 1)
+
 #endif /* _LINUX_NF_TABLES_H */
diff --git a/net/core/dst.c b/net/core/dst.c
index 2d9b37f8944a..81ccf20e2826 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -307,6 +307,7 @@ void metadata_dst_free(struct metadata_dst *md_dst)
 #endif
 	kfree(md_dst);
 }
+EXPORT_SYMBOL_GPL(metadata_dst_free);
 
 struct metadata_dst __percpu *
 metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 55e399d5af10..654588088676 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -559,6 +559,12 @@ config NFT_NAT
 	  This option adds the "nat" expression that you can use to perform
 	  typical Network Address Translation (NAT) packet transformations.
 
+config NFT_TUNNEL
+	tristate "Netfilter nf_tables tunnel module"
+	help
+	  This option adds the "tunnel" expression that you can use to set
+	  tunneling policies.
+
 config NFT_OBJREF
 	tristate "Netfilter nf_tables stateful object reference module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index cf61615cc529..16895e045b66 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_QUEUE)		+= nft_queue.o
 obj-$(CONFIG_NFT_QUOTA)		+= nft_quota.o
 obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o
 obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o
+obj-$(CONFIG_NFT_TUNNEL)	+= nft_tunnel.o
 obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
 obj-$(CONFIG_NFT_LOG)		+= nft_log.o
 obj-$(CONFIG_NFT_MASQ)		+= nft_masq.o
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
new file mode 100644
index 000000000000..715613d99c20
--- /dev/null
+++ b/net/netfilter/nft_tunnel.c
@@ -0,0 +1,458 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
+
+struct nft_tunnel_opts {
+	union {
+		struct vxlan_metadata	vxlan;
+		struct erspan_metadata	erspan;
+	} u;
+	u32	len;
+	u32	flags;
+};
+
+struct nft_tunnel_obj {
+	struct metadata_dst	*md;
+	struct nft_tunnel_opts	opts;
+};
+
+static const struct nla_policy nft_tunnel_ip_policy[NFTA_TUNNEL_KEY_IP_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_IP_SRC]	= { .type = NLA_U32 },
+	[NFTA_TUNNEL_KEY_IP_DST]	= { .type = NLA_U32 },
+};
+
+static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx,
+				  const struct nlattr *attr,
+				  struct ip_tunnel_info *info)
+{
+	struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr,
+			       nft_tunnel_ip_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_TUNNEL_KEY_IP_DST])
+		return -EINVAL;
+
+	if (tb[NFTA_TUNNEL_KEY_IP_SRC])
+		info->key.u.ipv4.src = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_SRC]);
+	if (tb[NFTA_TUNNEL_KEY_IP_DST])
+		info->key.u.ipv4.dst = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_DST]);
+
+	return 0;
+}
+
+static const struct nla_policy nft_tunnel_ip6_policy[NFTA_TUNNEL_KEY_IP6_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_IP6_SRC]	= { .len = sizeof(struct in6_addr), },
+	[NFTA_TUNNEL_KEY_IP6_DST]	= { .len = sizeof(struct in6_addr), },
+	[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]	= { .type = NLA_U32, }
+};
+
+static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx,
+				   const struct nlattr *attr,
+				   struct ip_tunnel_info *info)
+{
+	struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr,
+			       nft_tunnel_ip6_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_TUNNEL_KEY_IP6_DST])
+		return -EINVAL;
+
+	if (tb[NFTA_TUNNEL_KEY_IP6_SRC]) {
+		memcpy(&info->key.u.ipv6.src,
+		       nla_data(tb[NFTA_TUNNEL_KEY_IP6_SRC]),
+		       sizeof(struct in6_addr));
+	}
+	if (tb[NFTA_TUNNEL_KEY_IP6_DST]) {
+		memcpy(&info->key.u.ipv6.dst,
+		       nla_data(tb[NFTA_TUNNEL_KEY_IP6_DST]),
+		       sizeof(struct in6_addr));
+	}
+	if (tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL])
+		info->key.label = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]);
+
+	info->mode |= IP_TUNNEL_INFO_IPV6;
+
+	return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_vxlan_policy[NFTA_TUNNEL_KEY_VXLAN_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_VXLAN_GBP]	= { .type = NLA_U32 },
+};
+
+static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
+				     struct nft_tunnel_opts *opts)
+{
+	struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr,
+			       nft_tunnel_opts_vxlan_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_TUNNEL_KEY_VXLAN_GBP])
+		return -EINVAL;
+
+	opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
+
+	opts->len	= sizeof(struct vxlan_metadata);
+	opts->flags	= TUNNEL_VXLAN_OPT;
+
+	return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_ERSPAN_VERSION]	= { .type = NLA_U32 },
+	[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]	= { .type = NLA_U32 },
+	[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]	= { .type = NLA_U8 },
+	[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]	= { .type = NLA_U8 },
+};
+
+/* Parse nested ERSPAN options into @opts; the version is mandatory. */
+static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
+				      struct nft_tunnel_opts *opts)
+{
+	struct nlattr *tb[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1];
+	uint8_t hwid, dir;
+	int err, version;
+
+	err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr,
+			       nft_tunnel_opts_erspan_policy, NULL);
+	if (err < 0)
+		return err;
+	if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])
+		return -EINVAL;
+
+	version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]));
+	switch (version) {
+	case ERSPAN_VERSION:
+		if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX])
+			return -EINVAL;
+		opts->u.erspan.u.index =
+			nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]);
+		break;
+	case ERSPAN_VERSION2:
+		if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] ||
+		    !tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID])
+			return -EINVAL;
+		hwid = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]);
+		dir = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]);
+		set_hwid(&opts->u.erspan.u.md2, hwid);
+		opts->u.erspan.u.md2.dir = dir;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	opts->u.erspan.version = version;
+	opts->len	= sizeof(struct erspan_metadata);
+	opts->flags	= TUNNEL_ERSPAN_OPT;
+
+	return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_OPTS_VXLAN]	= { .type = NLA_NESTED, },
+	[NFTA_TUNNEL_KEY_OPTS_ERSPAN]	= { .type = NLA_NESTED, },
+};
+
+static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
+				    const struct nlattr *attr,
+				    struct ip_tunnel_info *info,
+				    struct nft_tunnel_opts *opts)
+{
+	struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
+			       nft_tunnel_opts_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
+		err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
+						opts);
+	} else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
+		err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
+						 opts);
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_IP]	= { .type = NLA_NESTED, },
+	[NFTA_TUNNEL_KEY_IP6]	= { .type = NLA_NESTED, },
+	[NFTA_TUNNEL_KEY_ID]	= { .type = NLA_U32, },
+	[NFTA_TUNNEL_KEY_FLAGS]	= { .type = NLA_U32, },
+	[NFTA_TUNNEL_KEY_TOS]	= { .type = NLA_U8, },
+	[NFTA_TUNNEL_KEY_TTL]	= { .type = NLA_U8, },
+	[NFTA_TUNNEL_KEY_SPORT]	= { .type = NLA_U16, },
+	[NFTA_TUNNEL_KEY_DPORT]	= { .type = NLA_U16, },
+	[NFTA_TUNNEL_KEY_OPTS]	= { .type = NLA_NESTED, },
+};
+
+/* Build the TX tunnel metadata dst for @obj from the netlink attributes. */
+static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
+			       const struct nlattr * const tb[],
+			       struct nft_object *obj)
+{
+	struct nft_tunnel_obj *priv = nft_obj_data(obj);
+	struct ip_tunnel_info info;
+	struct metadata_dst *md;
+	int err;
+
+	if (!tb[NFTA_TUNNEL_KEY_ID])
+		return -EINVAL;
+
+	memset(&info, 0, sizeof(info));
+	info.mode		= IP_TUNNEL_INFO_TX;
+	info.key.tun_id		= key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
+	info.key.tun_flags	= TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+
+	if (tb[NFTA_TUNNEL_KEY_IP]) {
+		err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
+		if (err < 0)
+			return err;
+	} else if (tb[NFTA_TUNNEL_KEY_IP6]) {
+		err = nft_tunnel_obj_ip6_init(ctx, tb[NFTA_TUNNEL_KEY_IP6], &info);
+		if (err < 0)
+			return err;
+	} else {
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_TUNNEL_KEY_SPORT])
+		info.key.tp_src =
+			ntohs(nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT]));
+	if (tb[NFTA_TUNNEL_KEY_DPORT])
+		info.key.tp_dst =
+			ntohs(nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT]));
+
+	if (tb[NFTA_TUNNEL_KEY_FLAGS]) {
+		u32 tun_flags;
+
+		tun_flags = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_FLAGS]));
+		if (tun_flags & ~NFT_TUNNEL_F_MASK)
+			return -EOPNOTSUPP;
+
+		if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
+			info.key.tun_flags &= ~TUNNEL_CSUM;
+		if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
+			info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+		if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
+			info.key.tun_flags |= TUNNEL_SEQ;
+	}
+	if (tb[NFTA_TUNNEL_KEY_TOS])
+		info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
+	info.key.ttl = U8_MAX;
+	if (tb[NFTA_TUNNEL_KEY_TTL])
+		info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
+
+	if (tb[NFTA_TUNNEL_KEY_OPTS]) {
+		err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
+					       &info, &priv->opts);
+		if (err < 0)
+			return err;
+	}
+
+	md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+	if (!md)
+		return -ENOMEM;
+
+	memcpy(&md->u.tun_info, &info, sizeof(info));
+	ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len,
+				priv->opts.flags);
+	priv->md = md;
+
+	return 0;
+}
+
+static inline void nft_tunnel_obj_eval(struct nft_object *obj,
+				       struct nft_regs *regs,
+				       const struct nft_pktinfo *pkt)
+{
+	struct nft_tunnel_obj *priv = nft_obj_data(obj);
+	struct sk_buff *skb = pkt->skb;
+
+	skb_dst_drop(skb);
+	dst_hold((struct dst_entry *) priv->md);
+	skb_dst_set(skb, (struct dst_entry *) priv->md);
+}
+
+static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+	struct nlattr *nest;
+
+	if (info->mode & IP_TUNNEL_INFO_IPV6) {
+		nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6);
+		if (!nest)
+			return -1;
+
+		if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
+		    nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
+		    nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
+			return -1;
+
+		nla_nest_end(skb, nest);
+	} else {
+		nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP);
+		if (!nest)
+			return -1;
+
+		if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
+		    nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
+			return -1;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+}
+
+static int nft_tunnel_opts_dump(struct sk_buff *skb,
+				struct nft_tunnel_obj *priv)
+{
+	struct nft_tunnel_opts *opts = &priv->opts;
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS);
+	if (!nest)
+		return -1;
+
+	if (opts->flags & TUNNEL_VXLAN_OPT) {
+		if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
+				 htonl(opts->u.vxlan.gbp)))
+			return -1;
+	} else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+		switch (opts->u.erspan.version) {
+		case ERSPAN_VERSION:
+			if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
+					 opts->u.erspan.u.index))
+				return -1;
+			break;
+		case ERSPAN_VERSION2:
+			if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
+				       get_hwid(&opts->u.erspan.u.md2)) ||
+			    nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
+				       opts->u.erspan.u.md2.dir))
+				return -1;
+			break;
+		}
+	}
+	nla_nest_end(skb, nest);
+
+	return 0;
+}
+
+static int nft_tunnel_ports_dump(struct sk_buff *skb,
+				 struct ip_tunnel_info *info)
+{
+	if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
+	    nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+		return -1;
+
+	return 0;
+}
+
+static int nft_tunnel_flags_dump(struct sk_buff *skb,
+				 struct ip_tunnel_info *info)
+{
+	u32 flags = 0;
+
+	if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+		flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
+	if (!(info->key.tun_flags & TUNNEL_CSUM))
+		flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
+	if (info->key.tun_flags & TUNNEL_SEQ)
+		flags |= NFT_TUNNEL_F_SEQ_NUMBER;
+
+	if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
+		return -1;
+
+	return 0;
+}
+
+static int nft_tunnel_obj_dump(struct sk_buff *skb,
+			       struct nft_object *obj, bool reset)
+{
+	struct nft_tunnel_obj *priv = nft_obj_data(obj);
+	struct ip_tunnel_info *info = &priv->md->u.tun_info;
+
+	if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ID,
+			 tunnel_id_to_key32(info->key.tun_id)) ||
+	    nft_tunnel_ip_dump(skb, info) < 0 ||
+	    nft_tunnel_ports_dump(skb, info) < 0 ||
+	    nft_tunnel_flags_dump(skb, info) < 0 ||
+	    nla_put_u8(skb, NFTA_TUNNEL_KEY_TOS, info->key.tos) ||
+	    nla_put_u8(skb, NFTA_TUNNEL_KEY_TTL, info->key.ttl) ||
+	    nft_tunnel_opts_dump(skb, priv) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx,
+				   struct nft_object *obj)
+{
+	struct nft_tunnel_obj *priv = nft_obj_data(obj);
+
+	metadata_dst_free(priv->md);
+}
+
+static struct nft_object_type nft_tunnel_obj_type;
+static const struct nft_object_ops nft_tunnel_obj_ops = {
+	.type		= &nft_tunnel_obj_type,
+	.size		= sizeof(struct nft_tunnel_obj),
+	.eval		= nft_tunnel_obj_eval,
+	.init		= nft_tunnel_obj_init,
+	.destroy	= nft_tunnel_obj_destroy,
+	.dump		= nft_tunnel_obj_dump,
+};
+
+static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_TUNNEL,
+	.ops		= &nft_tunnel_obj_ops,
+	.maxattr	= NFTA_TUNNEL_KEY_MAX,
+	.policy		= nft_tunnel_key_policy,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_tunnel_module_init(void)
+{
+	return nft_register_obj(&nft_tunnel_obj_type);
+}
+
+static void __exit nft_tunnel_module_exit(void)
+{
+	nft_unregister_obj(&nft_tunnel_obj_type);
+}
+
+module_init(nft_tunnel_module_init);
+module_exit(nft_tunnel_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
-- 
cgit v1.2.3


From aaecfdb5c5dd8bac2dfd112166844a9f2d5711f0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 2 Aug 2018 20:51:46 +0200
Subject: netfilter: nf_tables: match on tunnel metadata

This patch allows us to match on the tunnel metadata that is attached to
the packet. We can use this to check whether the packet comes from/goes
to a tunnel and to match on the corresponding tunnel ID.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  15 +++++
 net/netfilter/nft_tunnel.c               | 112 ++++++++++++++++++++++++++++++-
 2 files changed, 126 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 3ee1198eeac1..357862d948de 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1647,4 +1647,19 @@ enum nft_tunnel_key_attributes {
 };
 #define NFTA_TUNNEL_KEY_MAX	(__NFTA_TUNNEL_KEY_MAX - 1)
 
+enum nft_tunnel_keys {
+	NFT_TUNNEL_PATH,
+	NFT_TUNNEL_ID,
+	__NFT_TUNNEL_MAX
+};
+#define NFT_TUNNEL_MAX	(__NFT_TUNNEL_MAX - 1)
+
+enum nft_tunnel_attributes {
+	NFTA_TUNNEL_UNSPEC,
+	NFTA_TUNNEL_KEY,
+	NFTA_TUNNEL_DREG,
+	__NFTA_TUNNEL_MAX
+};
+#define NFTA_TUNNEL_MAX	(__NFTA_TUNNEL_MAX - 1)
+
 #endif /* _LINUX_NF_TABLES_H */
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 715613d99c20..9332d7933dd5 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -12,6 +12,104 @@
 #include <net/vxlan.h>
 #include <net/erspan.h>
 
+struct nft_tunnel {
+	enum nft_tunnel_keys	key:8;
+	enum nft_registers	dreg:8;
+};
+
+static void nft_tunnel_get_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	const struct nft_tunnel *priv = nft_expr_priv(expr);
+	u32 *dest = &regs->data[priv->dreg];
+	struct ip_tunnel_info *tun_info;
+
+	tun_info = skb_tunnel_info(pkt->skb);
+
+	switch (priv->key) {
+	case NFT_TUNNEL_PATH:
+		nft_reg_store8(dest, !!tun_info);
+		break;
+	case NFT_TUNNEL_ID:
+		if (!tun_info) {
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+		*dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+		break;
+	default:
+		WARN_ON(1);
+		regs->verdict.code = NFT_BREAK;
+	}
+}
+
+static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
+	[NFTA_TUNNEL_KEY]	= { .type = NLA_U32 },
+	[NFTA_TUNNEL_DREG]	= { .type = NLA_U32 },
+};
+
+static int nft_tunnel_get_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_tunnel *priv = nft_expr_priv(expr);
+	u32 len;
+
+	/* Both are dereferenced below: reject if either one is missing. */
+	if (!tb[NFTA_TUNNEL_KEY] || !tb[NFTA_TUNNEL_DREG])
+		return -EINVAL;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY]));
+	switch (priv->key) {
+	case NFT_TUNNEL_PATH:
+		len = sizeof(u8);
+		break;
+	case NFT_TUNNEL_ID:
+		len = sizeof(u32);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
+
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, len);
+}
+
+static int nft_tunnel_get_dump(struct sk_buff *skb,
+			       const struct nft_expr *expr)
+{
+	const struct nft_tunnel *priv = nft_expr_priv(expr);
+
+	if (nla_put_be32(skb, NFTA_TUNNEL_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_tunnel_type;
+static const struct nft_expr_ops nft_tunnel_get_ops = {
+	.type		= &nft_tunnel_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_tunnel)),
+	.eval		= nft_tunnel_get_eval,
+	.init		= nft_tunnel_get_init,
+	.dump		= nft_tunnel_get_dump,
+};
+
+static struct nft_expr_type nft_tunnel_type __read_mostly = {
+	.name		= "tunnel",
+	.ops		= &nft_tunnel_get_ops,
+	.policy		= nft_tunnel_policy,
+	.maxattr	= NFTA_TUNNEL_MAX,
+	.owner		= THIS_MODULE,
+};
+
 struct nft_tunnel_opts {
 	union {
 		struct vxlan_metadata	vxlan;
@@ -442,12 +540,23 @@ static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
 
 static int __init nft_tunnel_module_init(void)
 {
-	return nft_register_obj(&nft_tunnel_obj_type);
+	int err;
+
+	err = nft_register_expr(&nft_tunnel_type);
+	if (err < 0)
+		return err;
+
+	err = nft_register_obj(&nft_tunnel_obj_type);
+	if (err < 0)
+		nft_unregister_expr(&nft_tunnel_type);
+
+	return err;
 }
 
 static void __exit nft_tunnel_module_exit(void)
 {
 	nft_unregister_obj(&nft_tunnel_obj_type);
+	nft_unregister_expr(&nft_tunnel_type);
 }
 
 module_init(nft_tunnel_module_init);
@@ -455,4 +564,5 @@ module_exit(nft_tunnel_module_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("tunnel");
 MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
-- 
cgit v1.2.3


From 94276fa8a2a4c08ccb2e9d55e88b95dc972ccea3 Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Fri, 3 Aug 2018 13:36:13 +0200
Subject: netfilter: bridge: Expose nf_tables bridge hook priorities through
 uapi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Netfilter exposes standard hook priorities in case of ipv4, ipv6 and
arp but not in case of bridge.

This patch exposes the hook priority values of the bridge family (which
differ from those of the families mentioned above) via uapi so that they
can be used by user-space applications just like the others.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h      | 11 -----------
 include/uapi/linux/netfilter_bridge.h | 11 +++++++++++
 net/bridge/br_netfilter_hooks.c       |  1 +
 net/bridge/netfilter/ebtable_filter.c |  1 +
 net/bridge/netfilter/ebtable_nat.c    |  1 +
 5 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index b671fdfd212b..fa0686500970 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -5,17 +5,6 @@
 #include <uapi/linux/netfilter_bridge.h>
 #include <linux/skbuff.h>
 
-enum nf_br_hook_priorities {
-	NF_BR_PRI_FIRST = INT_MIN,
-	NF_BR_PRI_NAT_DST_BRIDGED = -300,
-	NF_BR_PRI_FILTER_BRIDGED = -200,
-	NF_BR_PRI_BRNF = 0,
-	NF_BR_PRI_NAT_DST_OTHER = 100,
-	NF_BR_PRI_FILTER_OTHER = 200,
-	NF_BR_PRI_NAT_SRC = 300,
-	NF_BR_PRI_LAST = INT_MAX,
-};
-
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 
 int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h
index 12fb77633f83..156ccd089df1 100644
--- a/include/uapi/linux/netfilter_bridge.h
+++ b/include/uapi/linux/netfilter_bridge.h
@@ -26,4 +26,15 @@
 #define NF_BR_BROUTING		5
 #define NF_BR_NUMHOOKS		6
 
+enum nf_br_hook_priorities {
+	NF_BR_PRI_FIRST = INT_MIN,
+	NF_BR_PRI_NAT_DST_BRIDGED = -300,
+	NF_BR_PRI_FILTER_BRIDGED = -200,
+	NF_BR_PRI_BRNF = 0,
+	NF_BR_PRI_NAT_DST_OTHER = 100,
+	NF_BR_PRI_FILTER_OTHER = 200,
+	NF_BR_PRI_NAT_SRC = 300,
+	NF_BR_PRI_LAST = INT_MAX,
+};
+
 #endif /* _UAPI__LINUX_BRIDGE_NETFILTER_H */
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 9b16eaf33819..6e0dc6bcd32a 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -26,6 +26,7 @@
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
 #include <linux/netfilter_bridge.h>
+#include <uapi/linux/netfilter_bridge.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index c41da5fac84f..550324c516ee 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
 #include <linux/module.h>
 
 #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 08df7406ecb3..c0fb3ca518af 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
 #include <linux/module.h>
 
 #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
-- 
cgit v1.2.3


From d89d41556141a527030a15233135ba622ba3350d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sat, 4 Aug 2018 14:20:40 -0700
Subject: ethtool: Remove trailing semicolon for static inline

Android's header sanitization tool chokes on static inline functions having a
trailing semicolon, leading to an incorrectly parsed header file. While the
tool should obviously be fixed, also fix the header files for the two affected
functions: ethtool_get_flow_spec_ring() and ethtool_get_flow_spec_ring_vf().

Fixes: 8cf6f497de40 ("ethtool: Add helper routines to pass vf to rx_flow_spec")
Reported-by: Blair Prescott <blair.prescott@broadcom.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 7363f18e65a5..813282cc8af6 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -902,13 +902,13 @@ struct ethtool_rx_flow_spec {
 static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie)
 {
 	return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie;
-};
+}
 
 static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie)
 {
 	return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >>
 				ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF;
-};
+}
 
 /**
  * struct ethtool_rxnfc - command to get or set RX flow classification rules
-- 
cgit v1.2.3


From 7969e5c40dfd04799d4341f1b7cd266b6e47f227 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Thu, 2 Aug 2018 23:34:37 +0000
Subject: ip: discard IPv4 datagrams with overlapping segments.

This behavior is required in IPv6, and there is little need
to tolerate overlapping fragments in IPv4. This change
simplifies the code and eliminates potential DDoS attack vectors.

Tested: ran ip_defrag selftest (not yet available upstream).

Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h |  1 +
 net/ipv4/ip_fragment.c    | 75 ++++++++++++-----------------------------------
 net/ipv4/proc.c           |  1 +
 3 files changed, 21 insertions(+), 56 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index e5ebc83827ab..f80135e5feaa 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -56,6 +56,7 @@ enum
 	IPSTATS_MIB_ECT1PKTS,			/* InECT1Pkts */
 	IPSTATS_MIB_ECT0PKTS,			/* InECT0Pkts */
 	IPSTATS_MIB_CEPKTS,			/* InCEPkts */
+	IPSTATS_MIB_REASM_OVERLAPS,		/* ReasmOverlaps */
 	__IPSTATS_MIB_MAX
 };
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d14d741fb05e..960bf5eab59f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -277,6 +277,7 @@ static int ip_frag_reinit(struct ipq *qp)
 /* Add new segment to existing queue. */
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
+	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct sk_buff *prev, *next;
 	struct net_device *dev;
 	unsigned int fragsize;
@@ -357,65 +358,23 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	}
 
 found:
-	/* We found where to put this one.  Check for overlap with
-	 * preceding fragment, and, if needed, align things so that
-	 * any overlaps are eliminated.
+	/* RFC5722, Section 4, amended by Errata ID : 3089
+	 *                          When reassembling an IPv6 datagram, if
+	 *   one or more its constituent fragments is determined to be an
+	 *   overlapping fragment, the entire datagram (and any constituent
+	 *   fragments) MUST be silently discarded.
+	 *
+	 * We do the same here for IPv4.
 	 */
-	if (prev) {
-		int i = (prev->ip_defrag_offset + prev->len) - offset;
 
-		if (i > 0) {
-			offset += i;
-			err = -EINVAL;
-			if (end <= offset)
-				goto err;
-			err = -ENOMEM;
-			if (!pskb_pull(skb, i))
-				goto err;
-			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
-	}
+	/* Is there an overlap with the previous fragment? */
+	if (prev &&
+	    (prev->ip_defrag_offset + prev->len) > offset)
+		goto discard_qp;
 
-	err = -ENOMEM;
-
-	while (next && next->ip_defrag_offset < end) {
-		int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
-
-		if (i < next->len) {
-			int delta = -next->truesize;
-
-			/* Eat head of the next overlapped fragment
-			 * and leave the loop. The next ones cannot overlap.
-			 */
-			if (!pskb_pull(next, i))
-				goto err;
-			delta += next->truesize;
-			if (delta)
-				add_frag_mem_limit(qp->q.net, delta);
-			next->ip_defrag_offset += i;
-			qp->q.meat -= i;
-			if (next->ip_summed != CHECKSUM_UNNECESSARY)
-				next->ip_summed = CHECKSUM_NONE;
-			break;
-		} else {
-			struct sk_buff *free_it = next;
-
-			/* Old fragment is completely overridden with
-			 * new one drop it.
-			 */
-			next = next->next;
-
-			if (prev)
-				prev->next = next;
-			else
-				qp->q.fragments = next;
-
-			qp->q.meat -= free_it->len;
-			sub_frag_mem_limit(qp->q.net, free_it->truesize);
-			kfree_skb(free_it);
-		}
-	}
+	/* Is there an overlap with the next fragment? */
+	if (next && next->ip_defrag_offset < end)
+		goto discard_qp;
 
 	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
 	dev = skb->dev;
@@ -463,6 +422,10 @@ found:
 	skb_dst_drop(skb);
 	return -EINPROGRESS;
 
+discard_qp:
+	inet_frag_kill(&qp->q);
+	err = -EINVAL;
+	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
 err:
 	kfree_skb(skb);
 	return err;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b46e4cf9a55a..70289682a670 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
 	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
 	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+	SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3


From bfe4037e722ec672c9dafd5730d9132afeeb76e9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Jul 2018 09:08:20 +0200
Subject: aio: implement IOCB_CMD_POLL

Simple one-shot poll through the io_submit() interface.  To poll for
a file descriptor the application should submit an iocb of type
IOCB_CMD_POLL.  It will poll the fd for the events specified in the
first 32 bits of the aio_buf field of the iocb.

Unlike poll or epoll without EPOLLONESHOT this interface always works
in one shot mode, that is once the iocb is completed, it will have to be
resubmitted.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Avi Kivity <avi@scylladb.com>
---
 fs/aio.c                     | 178 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/aio_abi.h |   6 +-
 2 files changed, 180 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/aio.c b/fs/aio.c
index fe2018ada32c..2fd19521d8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
  *	Implements an efficient asynchronous io interface.
  *
  *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
+ *	Copyright 2018 Christoph Hellwig.
  *
  *	See ../COPYING for licensing terms.
  */
@@ -165,10 +166,21 @@ struct fsync_iocb {
 	bool			datasync;
 };
 
+struct poll_iocb {
+	struct file		*file;
+	struct wait_queue_head	*head;
+	__poll_t		events;
+	bool			woken;
+	bool			cancelled;
+	struct wait_queue_entry	wait;
+	struct work_struct	work;
+};
+
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
 		struct fsync_iocb	fsync;
+		struct poll_iocb	poll;
 	};
 
 	struct kioctx		*ki_ctx;
@@ -1601,6 +1613,169 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	return 0;
 }
 
+static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+	struct file *file = iocb->poll.file;
+
+	aio_complete(iocb, mangle_poll(mask), 0);
+	fput(file);
+}
+
+static void aio_poll_complete_work(struct work_struct *work)
+{
+	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+	struct poll_table_struct pt = { ._key = req->events };
+	struct kioctx *ctx = iocb->ki_ctx;
+	__poll_t mask = 0;
+
+	if (!READ_ONCE(req->cancelled))
+		mask = vfs_poll(req->file, &pt) & req->events;
+
+	/*
+	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
+	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
+	 * synchronize with them.  In the cancellation case the list_del_init
+	 * itself is not actually needed, but harmless so we keep it in to
+	 * avoid further branches in the fast path.
+	 */
+	spin_lock_irq(&ctx->ctx_lock);
+	if (!mask && !READ_ONCE(req->cancelled)) {
+		add_wait_queue(req->head, &req->wait);
+		spin_unlock_irq(&ctx->ctx_lock);
+		return;
+	}
+	list_del_init(&iocb->ki_list);
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	aio_poll_complete(iocb, mask);
+}
+
+/* assumes we are called with irqs disabled */
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+	struct poll_iocb *req = &aiocb->poll;
+
+	spin_lock(&req->head->lock);
+	WRITE_ONCE(req->cancelled, true);
+	if (!list_empty(&req->wait.entry)) {
+		list_del_init(&req->wait.entry);
+		schedule_work(&aiocb->poll.work);
+	}
+	spin_unlock(&req->head->lock);
+
+	return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+		void *key)
+{
+	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+	__poll_t mask = key_to_poll(key);
+
+	req->woken = true;
+
+	/* for instances that support it check for an event match first: */
+	if (mask && !(mask & req->events))
+		return 0;
+
+	list_del_init(&req->wait.entry);
+	schedule_work(&req->work);
+	return 1;
+}
+
+struct aio_poll_table {
+	struct poll_table_struct	pt;
+	struct aio_kiocb		*iocb;
+	int				error;
+};
+
+static void
+aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
+		struct poll_table_struct *p)
+{
+	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
+
+	/* multiple wait queues per file are not supported */
+	if (unlikely(pt->iocb->poll.head)) {
+		pt->error = -EINVAL;
+		return;
+	}
+
+	pt->error = 0;
+	pt->iocb->poll.head = head;
+	add_wait_queue(head, &pt->iocb->poll.wait);
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+	struct kioctx *ctx = aiocb->ki_ctx;
+	struct poll_iocb *req = &aiocb->poll;
+	struct aio_poll_table apt;
+	__poll_t mask;
+
+	/* reject any unknown events outside the normal event mask. */
+	if ((u16)iocb->aio_buf != iocb->aio_buf)
+		return -EINVAL;
+	/* reject fields that are not defined for poll */
+	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+		return -EINVAL;
+
+	INIT_WORK(&req->work, aio_poll_complete_work);
+	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+	req->file = fget(iocb->aio_fildes);
+	if (unlikely(!req->file))
+		return -EBADF;
+
+	apt.pt._qproc = aio_poll_queue_proc;
+	apt.pt._key = req->events;
+	apt.iocb = aiocb;
+	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
+
+	/* initialized the list so that we can do list_empty checks */
+	INIT_LIST_HEAD(&req->wait.entry);
+	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+	/* one for removal from waitqueue, one for this function */
+	refcount_set(&aiocb->ki_refcnt, 2);
+
+	mask = vfs_poll(req->file, &apt.pt) & req->events;
+	if (unlikely(!req->head)) {
+		/* we did not manage to set up a waitqueue, done */
+		goto out;
+	}
+
+	spin_lock_irq(&ctx->ctx_lock);
+	spin_lock(&req->head->lock);
+	if (req->woken) {
+		/* wake_up context handles the rest */
+		mask = 0;
+		apt.error = 0;
+	} else if (mask || apt.error) {
+		/* if we get an error or a mask we are done */
+		WARN_ON_ONCE(list_empty(&req->wait.entry));
+		list_del_init(&req->wait.entry);
+	} else {
+		/* actually waiting for an event */
+		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+		aiocb->ki_cancel = aio_poll_cancel;
+	}
+	spin_unlock(&req->head->lock);
+	spin_unlock_irq(&ctx->ctx_lock);
+
+out:
+	if (unlikely(apt.error)) {
+		fput(req->file);
+		return apt.error;
+	}
+
+	if (mask)
+		aio_poll_complete(aiocb, mask);
+	iocb_put(aiocb);
+	return 0;
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 bool compat)
 {
@@ -1674,6 +1849,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	case IOCB_CMD_FDSYNC:
 		ret = aio_fsync(&req->fsync, &iocb, true);
 		break;
+	case IOCB_CMD_POLL:
+		ret = aio_poll(req, &iocb);
+		break;
 	default:
 		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
 		ret = -EINVAL;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d4593a6062ef..ce43d340f010 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -38,10 +38,8 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* These two are experimental.
-	 * IOCB_CMD_PREADX = 4,
-	 * IOCB_CMD_POLL = 5,
-	 */
+	/* 4 was the experimental IOCB_CMD_PREADX */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
-- 
cgit v1.2.3


From 8fcc4b5923af5de58b80b53a069453b135693304 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Tue, 10 Jul 2018 11:27:20 +0200
Subject: kvm: nVMX: Introduce KVM_CAP_NESTED_STATE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For nested virtualization L0 KVM is managing a bit of state for L2 guests,
this state can not be captured through the currently available IOCTLs. In
fact the state captured through all of these IOCTLs is usually a mix of L1
and L2 state. It is also dependent on whether the L2 guest was running at
the moment when the process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE
and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM
that is in VMX operation.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson <jmattson@google.com>
[karahmed@ - rename structs and functions and make them ready for AMD and
             address previous comments.
           - handle nested.smm state.
           - rebase & a bit of refactoring.
           - Merge 7/8 and 8/8 into one patch. ]
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  56 ++++++++++++
 arch/x86/include/asm/kvm_host.h   |   6 ++
 arch/x86/include/uapi/asm/kvm.h   |  37 ++++++++
 arch/x86/kvm/vmx.c                | 175 +++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c                |  54 ++++++++++++
 include/uapi/linux/kvm.h          |   4 +
 6 files changed, 330 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index cb8db4f9d097..7b83b176c662 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3561,6 +3561,62 @@ Returns: 0 on success,
 	-ENOENT on deassign if the conn_id isn't registered
 	-EEXIST on assign if the conn_id is already registered
 
+4.114 KVM_GET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG:     the total state size (including the fixed-size part of struct
+             kvm_nested_state) exceeds the value of 'size' specified by
+             the user; the size required will be written into size.
+
+struct kvm_nested_state {
+	__u16 flags;
+	__u16 format;
+	__u32 size;
+	union {
+		struct kvm_vmx_nested_state vmx;
+		struct kvm_svm_nested_state svm;
+		__u8 pad[120];
+	};
+	__u8 data[0];
+};
+
+#define KVM_STATE_NESTED_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING	0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON	0x00000002
+
+struct kvm_vmx_nested_state {
+	__u64 vmxon_pa;
+	__u64 vmcs_pa;
+
+	struct {
+		__u16 flags;
+	} smm;
+};
+
+This ioctl copies the vcpu's nested virtualization state from the kernel to
+userspace.
+
+The maximum size of the state, including the fixed-size part of struct
+kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to
+the KVM_CHECK_EXTENSION ioctl().
+
+4.115 KVM_SET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in)
+Returns: 0 on success, -1 on error
+
+This copies the vcpu's kvm_nested_state struct from userspace to the kernel.  For
+the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da957725992d..bd287b348751 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1086,6 +1086,12 @@ struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+	int (*get_nested_state)(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				unsigned user_data_size);
+	int (*set_nested_state)(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				struct kvm_nested_state *kvm_state);
 	void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
 
 	int (*smi_allowed)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index c535c2fdea13..86299efa804a 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -378,4 +378,41 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED	(1 << 1)
 
+#define KVM_STATE_NESTED_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING	0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON	0x00000002
+
+struct kvm_vmx_nested_state {
+	__u64 vmxon_pa;
+	__u64 vmcs_pa;
+
+	struct {
+		__u16 flags;
+	} smm;
+};
+
+/* for KVM_CAP_NESTED_STATE */
+struct kvm_nested_state {
+	/* KVM_STATE_* flags */
+	__u16 flags;
+
+	/* 0 for VMX, 1 for SVM.  */
+	__u16 format;
+
+	/* 128 for SVM, 128 + VMCS size for VMX.  */
+	__u32 size;
+
+	union {
+		/* VMXON, VMCS */
+		struct kvm_vmx_nested_state vmx;
+
+		/* Pad the header to 128 bytes.  */
+		__u8 pad[120];
+	};
+
+	__u8 data[0];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fee44e4c5c79..4be6486173b7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7589,6 +7589,11 @@ static __init int hardware_setup(void)
 	else
 		kvm_disable_tdp();
 
+	if (!nested) {
+		kvm_x86_ops->get_nested_state = NULL;
+		kvm_x86_ops->set_nested_state = NULL;
+	}
+
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT
 	 * and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -11775,8 +11780,8 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 }
 
 /*
- * If exit_qual is NULL, this is being called from RSM.
- * Otherwise it's called from vmlaunch/vmresume.
+ * If exit_qual is NULL, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
  */
 static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
 {
@@ -13016,6 +13021,170 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				u32 user_data_size)
+{
+	struct vcpu_vmx *vmx;
+	struct vmcs12 *vmcs12;
+	struct kvm_nested_state kvm_state = {
+		.flags = 0,
+		.format = 0,
+		.size = sizeof(kvm_state),
+		.vmx.vmxon_pa = -1ull,
+		.vmx.vmcs_pa = -1ull,
+	};
+
+	if (!vcpu)
+		return kvm_state.size + 2 * VMCS12_SIZE;
+
+	vmx = to_vmx(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+	if (nested_vmx_allowed(vcpu) &&
+	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+		if (vmx->nested.current_vmptr != -1ull)
+			kvm_state.size += VMCS12_SIZE;
+
+		if (vmx->nested.smm.vmxon)
+			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+		if (vmx->nested.smm.guest_mode)
+			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+		if (is_guest_mode(vcpu)) {
+			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+			if (vmx->nested.nested_run_pending)
+				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+		}
+	}
+
+	if (user_data_size < kvm_state.size)
+		goto out;
+
+	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+		return -EFAULT;
+
+	if (vmx->nested.current_vmptr == -1ull)
+		goto out;
+
+	/*
+	 * When running L2, the authoritative vmcs12 state is in the
+	 * vmcs02. When running L1, the authoritative vmcs12 state is
+	 * in the shadow vmcs linked to vmcs01, unless
+	 * sync_shadow_vmcs is set, in which case, the authoritative
+	 * vmcs12 state is in the vmcs12 already.
+	 */
+	if (is_guest_mode(vcpu))
+		sync_vmcs12(vcpu, vmcs12);
+	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+
+	if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
+		return -EFAULT;
+
+out:
+	return kvm_state.size;
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				struct kvm_nested_state *kvm_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12;
+	u32 exit_qual;
+	int ret;
+
+	if (kvm_state->format != 0)
+		return -EINVAL;
+
+	if (!nested_vmx_allowed(vcpu))
+		return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+	if (kvm_state->vmx.vmxon_pa == -1ull) {
+		if (kvm_state->vmx.smm.flags)
+			return -EINVAL;
+
+		if (kvm_state->vmx.vmcs_pa != -1ull)
+			return -EINVAL;
+
+		vmx_leave_nested(vcpu);
+		return 0;
+	}
+
+	if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
+		return -EINVAL;
+
+	if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+		return -EINVAL;
+
+	if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+	    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+		return -EINVAL;
+
+	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+		return -EINVAL;
+
+	if (kvm_state->vmx.smm.flags &
+	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+		return -EINVAL;
+
+	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+	    !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+		return -EINVAL;
+
+	vmx_leave_nested(vcpu);
+	if (kvm_state->vmx.vmxon_pa == -1ull)
+		return 0;
+
+	vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
+	ret = enter_vmx_operation(vcpu);
+	if (ret)
+		return ret;
+
+	set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+	if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+		vmx->nested.smm.vmxon = true;
+		vmx->nested.vmxon = false;
+
+		if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+			vmx->nested.smm.guest_mode = true;
+	}
+
+	vmcs12 = get_vmcs12(vcpu);
+	if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
+		return -EFAULT;
+
+	if (vmcs12->revision_id != VMCS12_REVISION)
+		return -EINVAL;
+
+	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+		return 0;
+
+	vmx->nested.nested_run_pending =
+		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+	if (check_vmentry_prereqs(vcpu, vmcs12) ||
+	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+		return -EINVAL;
+
+	if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+		vmx->nested.nested_run_pending = 1;
+
+	vmx->nested.dirty_vmcs12 = true;
+	ret = enter_vmx_non_root_mode(vcpu, NULL);
+	if (ret)
+		return -EINVAL;
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -13150,6 +13319,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.setup_mce = vmx_setup_mce,
 
+	.get_nested_state = vmx_get_nested_state,
+	.set_nested_state = vmx_set_nested_state,
 	.get_vmcs12_pages = nested_get_vmcs12_pages,
 
 	.smi_allowed = vmx_smi_allowed,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fbd59ad047b0..1b14c4a654c3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2947,6 +2947,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X2APIC_API:
 		r = KVM_X2APIC_API_VALID_FLAGS;
 		break;
+	case KVM_CAP_NESTED_STATE:
+		r = kvm_x86_ops->get_nested_state ?
+			kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+		break;
 	default:
 		break;
 	}
@@ -3963,6 +3967,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
+	case KVM_GET_NESTED_STATE: {
+		struct kvm_nested_state __user *user_kvm_nested_state = argp;
+		u32 user_data_size;
+
+		r = -EINVAL;
+		if (!kvm_x86_ops->get_nested_state)
+			break;
+
+		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+		if (get_user(user_data_size, &user_kvm_nested_state->size))
+			return -EFAULT;
+
+		r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
+						  user_data_size);
+		if (r < 0)
+			return r;
+
+		if (r > user_data_size) {
+			if (put_user(r, &user_kvm_nested_state->size))
+				return -EFAULT;
+			return -E2BIG;
+		}
+		r = 0;
+		break;
+	}
+	case KVM_SET_NESTED_STATE: {
+		struct kvm_nested_state __user *user_kvm_nested_state = argp;
+		struct kvm_nested_state kvm_state;
+
+		r = -EINVAL;
+		if (!kvm_x86_ops->set_nested_state)
+			break;
+
+		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+			return -EFAULT;
+
+		if (kvm_state.size < sizeof(kvm_state))
+			return -EINVAL;
+
+		if (kvm_state.flags &
+		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+			return -EINVAL;
+
+		/* nested_run_pending implies guest_mode.  */
+		if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+			return -EINVAL;
+
+		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b955b986b341..3cf632839337 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -950,6 +950,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_EVENTFD 154
 #define KVM_CAP_HYPERV_TLBFLUSH 155
 #define KVM_CAP_S390_HPAGE_1M 156
+#define KVM_CAP_NESTED_STATE 157
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1392,6 +1393,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_HYPERV_EVENTFD */
 #define KVM_HYPERV_EVENTFD        _IOW(KVMIO,  0xbd, struct kvm_hyperv_eventfd)
 
+/* Available with KVM_CAP_NESTED_STATE */
+#define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
+#define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
-- 
cgit v1.2.3


From 4180bf1b655a791a0a6ef93a2ffffc762722c782 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Mon, 23 Jul 2018 14:39:54 +0800
Subject: KVM: X86: Implement "send IPI" hypercall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using hypercall to send IPIs by one vmexit instead of one by one for
xAPIC/x2APIC physical mode and one vmexit per-cluster for x2APIC cluster
mode. Intel guest can enter x2apic cluster mode when interrupt remapping
is enabled in qemu, however, latest AMD EPYC still just supports xapic
mode which can get great improvement by Exit-less IPIs. This patchset
lets a guest send multicast IPIs, with at most 128 destinations per
hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode.

Hardware: Xeon Skylake 2.5GHz, 2 sockets, 40 cores, 80 threads, the VM
is 80 vCPUs, IPI microbenchmark(https://lkml.org/lkml/2017/12/19/141):

x2apic cluster mode, vanilla

 Dry-run:                         0,            2392199 ns
 Self-IPI:                  6907514,           15027589 ns
 Normal IPI:              223910476,          251301666 ns
 Broadcast IPI:                   0,         9282161150 ns
 Broadcast lock:                  0,         8812934104 ns

x2apic cluster mode, pv-ipi

 Dry-run:                         0,            2449341 ns
 Self-IPI:                  6720360,           15028732 ns
 Normal IPI:              228643307,          255708477 ns
 Broadcast IPI:                   0,         7572293590 ns  => 22% performance boost
 Broadcast lock:                  0,         8316124651 ns

x2apic physical mode, vanilla

 Dry-run:                         0,            3135933 ns
 Self-IPI:                  8572670,           17901757 ns
 Normal IPI:              226444334,          255421709 ns
 Broadcast IPI:                   0,        19845070887 ns
 Broadcast lock:                  0,        19827383656 ns

x2apic physical mode, pv-ipi

 Dry-run:                         0,            2446381 ns
 Self-IPI:                  6788217,           15021056 ns
 Normal IPI:              219454441,          249583458 ns
 Broadcast IPI:                   0,         7806540019 ns  => 154% performance boost
 Broadcast lock:                  0,         9143618799 ns

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/cpuid.txt      |  4 ++++
 Documentation/virtual/kvm/hypercalls.txt | 20 ++++++++++++++++
 arch/x86/include/asm/kvm_host.h          |  4 ++++
 arch/x86/kvm/cpuid.c                     |  3 ++-
 arch/x86/kvm/lapic.c                     | 40 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c                       |  3 +++
 include/uapi/linux/kvm_para.h            |  1 +
 7 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index ab022dcd0911..97ca1940a0dc 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -62,6 +62,10 @@ KVM_FEATURE_ASYNC_PF_VMEXIT        ||    10 || paravirtualized async PF VM exit
                                    ||       || can be enabled by setting bit 2
                                    ||       || when writing to msr 0x4b564d02
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_SEND_IPI            ||    11 || guest checks this feature bit
+                                   ||       || before using paravirtualized
+                                   ||       || send IPIs.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index a890529c63ed..da24c138c8d1 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -121,3 +121,23 @@ compute the CLOCK_REALTIME for its clock, at the same instant.
 
 Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource,
 or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK.
+
+6. KVM_HC_SEND_IPI
+------------------------
+Architecture: x86
+Status: active
+Purpose: Send IPIs to multiple vCPUs.
+
+a0: lower part of the bitmap of destination APIC IDs
+a1: higher part of the bitmap of destination APIC IDs
+a2: the lowest APIC ID in bitmap
+a3: APIC ICR
+
+The hypercall lets a guest send multicast IPIs, with at most 128
+destinations per hypercall in 64-bit mode and 64 vCPUs per
+hypercall in 32-bit mode.  The destinations are represented by a
+bitmap contained in the first two arguments (a0 and a1). Bit 0 of
+a0 corresponds to the APIC ID in the third argument (a2), bit 1
+corresponds to the APIC ID a2+1, and so on.
+
+Returns the number of CPUs to which the IPIs were delivered successfully.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 150937e64f63..c18958ef17d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1457,6 +1457,10 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+    		    unsigned long ipi_bitmap_high, int min,
+		    unsigned long icr, int op_64_bit);
+
 void kvm_define_shared_msr(unsigned index, u32 msr);
 int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7e042e3d47fd..7bcfa61375c0 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
 			     (1 << KVM_FEATURE_PV_UNHALT) |
 			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
-			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT);
+			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+			     (1 << KVM_FEATURE_PV_SEND_IPI);
 
 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b5cd8465d44f..f0d693122c24 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 			irq->level, irq->trig_mode, dest_map);
 }
 
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+    		    unsigned long ipi_bitmap_high, int min,
+		    unsigned long icr, int op_64_bit)
+{
+	int i;
+	struct kvm_apic_map *map;
+	struct kvm_vcpu *vcpu;
+	struct kvm_lapic_irq irq = {0};
+	int cluster_size = op_64_bit ? 64 : 32;
+	int count = 0;
+
+	irq.vector = icr & APIC_VECTOR_MASK;
+	irq.delivery_mode = icr & APIC_MODE_MASK;
+	irq.level = (icr & APIC_INT_ASSERT) != 0;
+	irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+	if (icr & APIC_DEST_MASK)
+		return -KVM_EINVAL;
+	if (icr & APIC_SHORT_MASK)
+		return -KVM_EINVAL;
+
+	rcu_read_lock();
+	map = rcu_dereference(kvm->arch.apic_map);
+
+	/* Bits above cluster_size are masked in the caller.  */
+	for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) {
+		vcpu = map->phys_map[min + i]->vcpu;
+		count += kvm_apic_set_irq(vcpu, &irq, NULL);
+	}
+
+	min += cluster_size;
+	for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) {
+		vcpu = map->phys_map[min + i]->vcpu;
+		count += kvm_apic_set_irq(vcpu, &irq, NULL);
+	}
+
+	rcu_read_unlock();
+	return count;
+}
+
 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
 {
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b974802cadb..3c83711c0ebe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6802,6 +6802,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_CLOCK_PAIRING:
 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
+	case KVM_HC_SEND_IPI:
+		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+		break;
 #endif
 	default:
 		ret = -KVM_ENOSYS;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..f3893ef82b65 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -13,6 +13,7 @@
 /* Return values for hypercalls */
 #define KVM_ENOSYS		1000
 #define KVM_EFAULT		EFAULT
+#define KVM_EINVAL		EINVAL
 #define KVM_E2BIG		E2BIG
 #define KVM_EPERM		EPERM
 #define KVM_EOPNOTSUPP		95
-- 
cgit v1.2.3


From aaffcfd1e82d3378538408d0310b7424b98d8f81 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Mon, 23 Jul 2018 14:39:52 +0800
Subject: KVM: X86: Implement PV IPIs in linux guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement paravirtual apic hooks to enable PV IPIs for KVM if the "send IPI"
hypercall is available.  The hypercall lets a guest send IPIs, with
at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per
hypercall in 32-bit mode.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/uapi/asm/kvm_para.h |  1 +
 arch/x86/kernel/kvm.c                | 96 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm_para.h        |  1 +
 3 files changed, 98 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 0ede697c3961..19980ec1a316 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -28,6 +28,7 @@
 #define KVM_FEATURE_PV_UNHALT		7
 #define KVM_FEATURE_PV_TLB_FLUSH	9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT	10
+#define KVM_FEATURE_PV_SEND_IPI	11
 
 #define KVM_HINTS_REALTIME      0
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 39d79720380f..62cbd089f709 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -454,6 +454,98 @@ static void __init sev_map_percpu_data(void)
 }
 
 #ifdef CONFIG_SMP
+#define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)
+
+static void __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	unsigned long flags;
+	int cpu, apic_id, icr;
+	int min = 0, max = 0;
+#ifdef CONFIG_X86_64
+	__uint128_t ipi_bitmap = 0;
+#else
+	u64 ipi_bitmap = 0;
+#endif
+
+	if (cpumask_empty(mask))
+		return;
+
+	local_irq_save(flags);
+
+	switch (vector) {
+	default:
+		icr = APIC_DM_FIXED | vector;
+		break;
+	case NMI_VECTOR:
+		icr = APIC_DM_NMI;
+		break;
+	}
+
+	for_each_cpu(cpu, mask) {
+		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
+		if (!ipi_bitmap) {
+			min = max = apic_id;
+		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
+			ipi_bitmap <<= min - apic_id;
+			min = apic_id;
+		} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+			max = apic_id < max ? max : apic_id;
+		} else {
+			kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+			min = max = apic_id;
+			ipi_bitmap = 0;
+		}
+		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
+	}
+
+	if (ipi_bitmap) {
+		kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+	}
+
+	local_irq_restore(flags);
+}
+
+static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	__send_ipi_mask(mask, vector);
+}
+
+static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+	unsigned int this_cpu = smp_processor_id();
+	struct cpumask new_mask;
+	const struct cpumask *local_mask;
+
+	cpumask_copy(&new_mask, mask);
+	cpumask_clear_cpu(this_cpu, &new_mask);
+	local_mask = &new_mask;
+	__send_ipi_mask(local_mask, vector);
+}
+
+static void kvm_send_ipi_allbutself(int vector)
+{
+	kvm_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void kvm_send_ipi_all(int vector)
+{
+	__send_ipi_mask(cpu_online_mask, vector);
+}
+
+/*
+ * Set the IPI entry points
+ */
+static void kvm_setup_pv_ipi(void)
+{
+	apic->send_IPI_mask = kvm_send_ipi_mask;
+	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
+	apic->send_IPI_allbutself = kvm_send_ipi_allbutself;
+	apic->send_IPI_all = kvm_send_ipi_all;
+	pr_info("KVM setup pv IPIs\n");
+}
+
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
 	native_smp_prepare_cpus(max_cpus);
@@ -626,6 +718,10 @@ static uint32_t __init kvm_detect(void)
 
 static void __init kvm_apic_init(void)
 {
+#if defined(CONFIG_SMP)
+	if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+		kvm_setup_pv_ipi();
+#endif
 }
 
 static void __init kvm_init_platform(void)
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index f3893ef82b65..6c0ce49931e5 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -27,6 +27,7 @@
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
+#define KVM_HC_SEND_IPI		10
 
 /*
  * hypercalls use architecture specific
-- 
cgit v1.2.3


From 429711aec282c4b5fe5bbd7b2f0bbbff4110ffb2 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 6 Aug 2018 11:17:47 +0800
Subject: vhost: switch to use new message format

We used to have a message like:

struct vhost_msg {
	int type;
	union {
		struct vhost_iotlb_msg iotlb;
		__u8 padding[64];
	};
};

Unfortunately, there will be a 32-bit hole after type on 64-bit machines
because of alignment. This leads to different formats between the 32-bit
API and the 64-bit API. What's more, it will break 32-bit programs running
on 64-bit machines.

So fix this by introducing a new message type with an explicit
32-bit reserved field after type, like:

struct vhost_msg_v2 {
	__u32 type;
	__u32 reserved;
	union {
		struct vhost_iotlb_msg iotlb;
		__u8 padding[64];
	};
};

We will have a consistent ABI after switching to use this. To enable
this capability, introduce a new ioctl (VHOST_SET_BACKEND_FEATURES) for
userspace to enable this feature (VHOST_BACKEND_F_IOTLB_MSG_V2).

Fixes: 6b1e6cc7855b ("vhost: new device IOTLB API")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/net.c        | 30 ++++++++++++++++++++
 drivers/vhost/vhost.c      | 71 ++++++++++++++++++++++++++++++++++------------
 drivers/vhost/vhost.h      | 11 ++++++-
 include/uapi/linux/vhost.h | 18 ++++++++++++
 4 files changed, 111 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 367d8023b54d..4e656f89cb22 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -77,6 +77,10 @@ enum {
 			 (1ULL << VIRTIO_F_IOMMU_PLATFORM)
 };
 
+enum {
+	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
+};
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -1399,6 +1403,21 @@ done:
 	return err;
 }
 
+static int vhost_net_set_backend_features(struct vhost_net *n, u64 features)
+{
+	int i;
+
+	mutex_lock(&n->dev.mutex);
+	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+		mutex_lock(&n->vqs[i].vq.mutex);
+		n->vqs[i].vq.acked_backend_features = features;
+		mutex_unlock(&n->vqs[i].vq.mutex);
+	}
+	mutex_unlock(&n->dev.mutex);
+
+	return 0;
+}
+
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
 	size_t vhost_hlen, sock_hlen, hdr_len;
@@ -1489,6 +1508,17 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 		if (features & ~VHOST_NET_FEATURES)
 			return -EOPNOTSUPP;
 		return vhost_net_set_features(n, features);
+	case VHOST_GET_BACKEND_FEATURES:
+		features = VHOST_NET_BACKEND_FEATURES;
+		if (copy_to_user(featurep, &features, sizeof(features)))
+			return -EFAULT;
+		return 0;
+	case VHOST_SET_BACKEND_FEATURES:
+		if (copy_from_user(&features, featurep, sizeof(features)))
+			return -EFAULT;
+		if (features & ~VHOST_NET_BACKEND_FEATURES)
+			return -EOPNOTSUPP;
+		return vhost_net_set_backend_features(n, features);
 	case VHOST_RESET_OWNER:
 		return vhost_net_reset_owner(n);
 	case VHOST_SET_OWNER:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1af4a21..6f6c42d5e4be 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -315,6 +315,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->log_addr = -1ull;
 	vq->private_data = NULL;
 	vq->acked_features = 0;
+	vq->acked_backend_features = 0;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
 	vq->kick = NULL;
@@ -1027,28 +1028,40 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
 			     struct iov_iter *from)
 {
-	struct vhost_msg_node node;
-	unsigned size = sizeof(struct vhost_msg);
-	size_t ret;
-	int err;
+	struct vhost_iotlb_msg msg;
+	size_t offset;
+	int type, ret;
 
-	if (iov_iter_count(from) < size)
-		return 0;
-	ret = copy_from_iter(&node.msg, size, from);
-	if (ret != size)
+	ret = copy_from_iter(&type, sizeof(type), from);
+	if (ret != sizeof(type))
 		goto done;
 
-	switch (node.msg.type) {
+	switch (type) {
 	case VHOST_IOTLB_MSG:
-		err = vhost_process_iotlb_msg(dev, &node.msg.iotlb);
-		if (err)
-			ret = err;
+		/* There maybe a hole after type for V1 message type,
+		 * so skip it here.
+		 */
+		offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
+		break;
+	case VHOST_IOTLB_MSG_V2:
+		offset = sizeof(__u32);
 		break;
 	default:
 		ret = -EINVAL;
-		break;
+		goto done;
+	}
+
+	iov_iter_advance(from, offset);
+	ret = copy_from_iter(&msg, sizeof(msg), from);
+	if (ret != sizeof(msg))
+		goto done;
+	if (vhost_process_iotlb_msg(dev, &msg)) {
+		ret = -EFAULT;
+		goto done;
 	}
 
+	ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
+	      sizeof(struct vhost_msg_v2);
 done:
 	return ret;
 }
@@ -1107,13 +1120,28 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
 		finish_wait(&dev->wait, &wait);
 
 	if (node) {
-		ret = copy_to_iter(&node->msg, size, to);
+		struct vhost_iotlb_msg *msg;
+		void *start = &node->msg;
+
+		switch (node->msg.type) {
+		case VHOST_IOTLB_MSG:
+			size = sizeof(node->msg);
+			msg = &node->msg.iotlb;
+			break;
+		case VHOST_IOTLB_MSG_V2:
+			size = sizeof(node->msg_v2);
+			msg = &node->msg_v2.iotlb;
+			break;
+		default:
+			BUG();
+			break;
+		}
 
-		if (ret != size || node->msg.type != VHOST_IOTLB_MISS) {
+		ret = copy_to_iter(start, size, to);
+		if (ret != size || msg->type != VHOST_IOTLB_MISS) {
 			kfree(node);
 			return ret;
 		}
-
 		vhost_enqueue_msg(dev, &dev->pending_list, node);
 	}
 
@@ -1126,12 +1154,19 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
 	struct vhost_dev *dev = vq->dev;
 	struct vhost_msg_node *node;
 	struct vhost_iotlb_msg *msg;
+	bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
 
-	node = vhost_new_msg(vq, VHOST_IOTLB_MISS);
+	node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
 	if (!node)
 		return -ENOMEM;
 
-	msg = &node->msg.iotlb;
+	if (v2) {
+		node->msg_v2.type = VHOST_IOTLB_MSG_V2;
+		msg = &node->msg_v2.iotlb;
+	} else {
+		msg = &node->msg.iotlb;
+	}
+
 	msg->type = VHOST_IOTLB_MISS;
 	msg->iova = iova;
 	msg->perm = access;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b90a168..466ef7542291 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -132,6 +132,7 @@ struct vhost_virtqueue {
 	struct vhost_umem *iotlb;
 	void *private_data;
 	u64 acked_features;
+	u64 acked_backend_features;
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
@@ -147,7 +148,10 @@ struct vhost_virtqueue {
 };
 
 struct vhost_msg_node {
-  struct vhost_msg msg;
+  union {
+	  struct vhost_msg msg;
+	  struct vhost_msg_v2 msg_v2;
+  };
   struct vhost_virtqueue *vq;
   struct list_head node;
 };
@@ -238,6 +242,11 @@ static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 	return vq->acked_features & (1ULL << bit);
 }
 
+static inline bool vhost_backend_has_feature(struct vhost_virtqueue *vq, int bit)
+{
+	return vq->acked_backend_features & (1ULL << bit);
+}
+
 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
 static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
 {
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c51f8e5cc608..b1e22c40c4b6 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -65,6 +65,7 @@ struct vhost_iotlb_msg {
 };
 
 #define VHOST_IOTLB_MSG 0x1
+#define VHOST_IOTLB_MSG_V2 0x2
 
 struct vhost_msg {
 	int type;
@@ -74,6 +75,15 @@ struct vhost_msg {
 	};
 };
 
+struct vhost_msg_v2 {
+	__u32 type;
+	__u32 reserved;
+	union {
+		struct vhost_iotlb_msg iotlb;
+		__u8 padding[64];
+	};
+};
+
 struct vhost_memory_region {
 	__u64 guest_phys_addr;
 	__u64 memory_size; /* bytes */
@@ -160,6 +170,14 @@ struct vhost_memory {
 #define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24,	\
 					 struct vhost_vring_state)
 
+/* Set or get vhost backend capability */
+
+/* Use message type V2 */
+#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1
+
+#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
+#define VHOST_GET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x26, __u64)
+
 /* VHOST_NET specific defines */
 
 /* Attach virtio net ring to a raw socket, or tap device.
-- 
cgit v1.2.3


From 35a8a3bd1c2e29bb6baec501c6f56abaaa10a48a Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 7 Aug 2018 11:43:02 +0200
Subject: netfilter: nft_osf: use NFT_OSF_MAXGENRELEN instead of IFNAMSIZ

As no "genre" in pf.os exceeds 16 bytes in length, we reduce the
NFT_OSF_MAXGENRELEN parameter to 16 bytes and use it instead of IFNAMSIZ.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 1 +
 net/netfilter/nft_osf.c                  | 8 +++-----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 357862d948de..94657c701f22 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -8,6 +8,7 @@
 #define NFT_SET_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_OBJ_MAXNAMELEN	NFT_NAME_MAXLEN
 #define NFT_USERDATA_MAXLEN	256
+#define NFT_OSF_MAXGENRELEN	16
 
 /**
  * enum nft_registers - nf_tables registers
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 9b2f3de7be4f..5af74b37f423 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -4,8 +4,6 @@
 #include <net/netfilter/nf_tables.h>
 #include <linux/netfilter/nfnetlink_osf.h>
 
-#define OSF_GENRE_SIZE		32
-
 struct nft_osf {
 	enum nft_registers	dreg:8;
 };
@@ -37,9 +35,9 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
 
 	os_name = nf_osf_find(skb, nf_osf_fingers);
 	if (!os_name)
-		strncpy((char *)dest, "unknown", IFNAMSIZ);
+		strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
 	else
-		strncpy((char *)dest, os_name, IFNAMSIZ);
+		strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN);
 }
 
 static int nft_osf_init(const struct nft_ctx *ctx,
@@ -51,7 +49,7 @@ static int nft_osf_init(const struct nft_ctx *ctx,
 
 	priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
 	err = nft_validate_register_store(ctx, priv->dreg, NULL,
-					  NFTA_DATA_VALUE, OSF_GENRE_SIZE);
+					  NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3


From 7e0b2b57f01d183e1c84114f1f2287737358d748 Mon Sep 17 00:00:00 2001
From: Harsha Sharma <harshasharmaiitr@gmail.com>
Date: Tue, 7 Aug 2018 17:14:23 +0200
Subject: netfilter: nft_ct: add ct timeout support

This patch allows adding, listing and deleting connection tracking timeout
policies via the nft objref infrastructure, and assigning these timeouts
via nft rules.

%./libnftnl/examples/nft-ct-timeout-add ip raw cttime tcp

Ruleset:

table ip raw {
   ct timeout cttime {
       protocol tcp;
       policy = {established: 111, close: 13 }
   }

   chain output {
       type filter hook output priority -300; policy accept;
       ct timeout set "cttime"
   }
}

%./libnftnl/examples/nft-rule-ct-timeout-add ip raw output cttime

%conntrack -E
[NEW] tcp      6 111 ESTABLISHED src=172.16.19.128 dst=172.16.19.1
sport=22 dport=41360 [UNREPLIED] src=172.16.19.1 dst=172.16.19.128
sport=41360 dport=22

%nft delete rule ip raw output handle <handle>
%./libnftnl/examples/nft-ct-timeout-del ip raw cttime

Joint work with Pablo Neira.

Signed-off-by: Harsha Sharma <harshasharmaiitr@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  14 ++-
 net/netfilter/nft_ct.c                   | 204 ++++++++++++++++++++++++++++++-
 2 files changed, 216 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 94657c701f22..e23290ffdc77 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -958,6 +958,7 @@ enum nft_socket_keys {
  * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address)
  * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address)
  * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address)
+ * @NFT_CT_TIMEOUT: connection tracking timeout policy assigned to conntrack
  */
 enum nft_ct_keys {
 	NFT_CT_STATE,
@@ -983,6 +984,7 @@ enum nft_ct_keys {
 	NFT_CT_DST_IP,
 	NFT_CT_SRC_IP6,
 	NFT_CT_DST_IP6,
+	NFT_CT_TIMEOUT,
 	__NFT_CT_MAX
 };
 #define NFT_CT_MAX		(__NFT_CT_MAX - 1)
@@ -1411,6 +1413,15 @@ enum nft_ct_helper_attributes {
 };
 #define NFTA_CT_HELPER_MAX	(__NFTA_CT_HELPER_MAX - 1)
 
+enum nft_ct_timeout_timeout_attributes {
+	NFTA_CT_TIMEOUT_UNSPEC,
+	NFTA_CT_TIMEOUT_L3PROTO,
+	NFTA_CT_TIMEOUT_L4PROTO,
+	NFTA_CT_TIMEOUT_DATA,
+	__NFTA_CT_TIMEOUT_MAX,
+};
+#define NFTA_CT_TIMEOUT_MAX	(__NFTA_CT_TIMEOUT_MAX - 1)
+
 #define NFT_OBJECT_UNSPEC	0
 #define NFT_OBJECT_COUNTER	1
 #define NFT_OBJECT_QUOTA	2
@@ -1418,7 +1429,8 @@ enum nft_ct_helper_attributes {
 #define NFT_OBJECT_LIMIT	4
 #define NFT_OBJECT_CONNLIMIT	5
 #define NFT_OBJECT_TUNNEL	6
-#define __NFT_OBJECT_MAX	7
+#define NFT_OBJECT_CT_TIMEOUT	7
+#define __NFT_OBJECT_MAX	8
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 3bc82ee5464d..4788458a0931 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -22,6 +22,8 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
 
 struct nft_ct {
 	enum nft_ct_keys	key:8;
@@ -765,6 +767,194 @@ static struct nft_expr_type nft_notrack_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static int
+nft_ct_timeout_parse_policy(void *timeouts,
+			    const struct nf_conntrack_l4proto *l4proto,
+			    struct net *net, const struct nlattr *attr)
+{
+	struct nlattr **tb;
+	int ret = 0;
+
+	if (!l4proto->ctnl_timeout.nlattr_to_obj)
+		return 0;
+
+	tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
+		     GFP_KERNEL);
+
+	if (!tb)
+		return -ENOMEM;
+
+	ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
+			       attr, l4proto->ctnl_timeout.nla_policy,
+			       NULL);
+	if (ret < 0)
+		goto err;
+
+	ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+
+err:
+	kfree(tb);
+	return ret;
+}
+
+struct nft_ct_timeout_obj {
+	struct nf_conn		*tmpl;
+	u8			l4proto;
+};
+
+static void nft_ct_timeout_obj_eval(struct nft_object *obj,
+				    struct nft_regs *regs,
+				    const struct nft_pktinfo *pkt)
+{
+	const struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+	struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb);
+	struct sk_buff *skb = pkt->skb;
+
+	if (ct ||
+	    priv->l4proto != pkt->tprot)
+		return;
+
+	nf_ct_set(skb, priv->tmpl, IP_CT_NEW);
+}
+
+static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
+				   const struct nlattr * const tb[],
+				   struct nft_object *obj)
+{
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+	struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+	const struct nf_conntrack_l4proto *l4proto;
+	struct nf_conn_timeout *timeout_ext;
+	struct nf_ct_timeout *timeout;
+	int l3num = ctx->family;
+	struct nf_conn *tmpl;
+	__u8 l4num;
+	int ret;
+
+	if (!tb[NFTA_CT_TIMEOUT_L3PROTO] ||
+	    !tb[NFTA_CT_TIMEOUT_L4PROTO] ||
+	    !tb[NFTA_CT_TIMEOUT_DATA])
+		return -EINVAL;
+
+	l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO]));
+	l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
+	priv->l4proto = l4num;
+
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	if (l4proto->l4proto != l4num) {
+		ret = -EOPNOTSUPP;
+		goto err_proto_put;
+	}
+
+	timeout = kzalloc(sizeof(struct nf_ct_timeout) +
+			  l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
+	if (timeout == NULL) {
+		ret = -ENOMEM;
+		goto err_proto_put;
+	}
+
+	ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net,
+					  tb[NFTA_CT_TIMEOUT_DATA]);
+	if (ret < 0)
+		goto err_free_timeout;
+
+	timeout->l3num = l3num;
+	timeout->l4proto = l4proto;
+	tmpl = nf_ct_tmpl_alloc(ctx->net, zone, GFP_ATOMIC);
+	if (!tmpl) {
+		ret = -ENOMEM;
+		goto err_free_timeout;
+	}
+
+	timeout_ext = nf_ct_timeout_ext_add(tmpl, timeout, GFP_ATOMIC);
+	if (!timeout_ext) {
+		ret = -ENOMEM;
+		goto err_free_tmpl;
+	}
+
+	ret = nf_ct_netns_get(ctx->net, ctx->family);
+	if (ret < 0)
+		goto err_free_tmpl;
+
+	priv->tmpl = tmpl;
+
+	return 0;
+
+err_free_tmpl:
+	nf_ct_tmpl_free(tmpl);
+err_free_timeout:
+	kfree(timeout);
+err_proto_put:
+	nf_ct_l4proto_put(l4proto);
+	return ret;
+}
+
+static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx,
+				       struct nft_object *obj)
+{
+	struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+	struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl);
+	struct nf_ct_timeout *timeout;
+
+	timeout = rcu_dereference_raw(t->timeout);
+	nf_ct_untimeout(ctx->net, timeout);
+	nf_ct_l4proto_put(timeout->l4proto);
+	nf_ct_netns_put(ctx->net, ctx->family);
+	nf_ct_tmpl_free(priv->tmpl);
+}
+
+static int nft_ct_timeout_obj_dump(struct sk_buff *skb,
+				   struct nft_object *obj, bool reset)
+{
+	const struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+	const struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl);
+	const struct nf_ct_timeout *timeout = rcu_dereference_raw(t->timeout);
+	struct nlattr *nest_params;
+	int ret;
+
+	if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) ||
+	    nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num)))
+		return -1;
+
+	nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA | NLA_F_NESTED);
+	if (!nest_params)
+		return -1;
+
+	ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data);
+	if (ret < 0)
+		return -1;
+	nla_nest_end(skb, nest_params);
+	return 0;
+}
+
+static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = {
+	[NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 },
+	[NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 },
+	[NFTA_CT_TIMEOUT_DATA]	  = {.type = NLA_NESTED },
+};
+
+static struct nft_object_type nft_ct_timeout_obj_type;
+
+static const struct nft_object_ops nft_ct_timeout_obj_ops = {
+	.type		= &nft_ct_timeout_obj_type,
+	.size		= sizeof(struct nft_ct_timeout_obj),
+	.eval		= nft_ct_timeout_obj_eval,
+	.init		= nft_ct_timeout_obj_init,
+	.destroy	= nft_ct_timeout_obj_destroy,
+	.dump		= nft_ct_timeout_obj_dump,
+};
+
+static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_CT_TIMEOUT,
+	.ops		= &nft_ct_timeout_obj_ops,
+	.maxattr	= NFTA_CT_TIMEOUT_MAX,
+	.policy		= nft_ct_timeout_policy,
+	.owner		= THIS_MODULE,
+};
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+
 static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
 				  const struct nlattr * const tb[],
 				  struct nft_object *obj)
@@ -949,9 +1139,17 @@ static int __init nft_ct_module_init(void)
 	err = nft_register_obj(&nft_ct_helper_obj_type);
 	if (err < 0)
 		goto err2;
-
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+	err = nft_register_obj(&nft_ct_timeout_obj_type);
+	if (err < 0)
+		goto err3;
+#endif
 	return 0;
 
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+err3:
+	nft_unregister_obj(&nft_ct_helper_obj_type);
+#endif
 err2:
 	nft_unregister_expr(&nft_notrack_type);
 err1:
@@ -961,6 +1159,9 @@ err1:
 
 static void __exit nft_ct_module_exit(void)
 {
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+	nft_unregister_obj(&nft_ct_timeout_obj_type);
+#endif
 	nft_unregister_obj(&nft_ct_helper_obj_type);
 	nft_unregister_expr(&nft_notrack_type);
 	nft_unregister_expr(&nft_ct_type);
@@ -974,3 +1175,4 @@ MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_ALIAS_NFT_EXPR("ct");
 MODULE_ALIAS_NFT_EXPR("notrack");
 MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
-- 
cgit v1.2.3


From 6cfef793b558eee47bac720574aff0d36b89d20a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 7 Aug 2018 10:50:20 -0700
Subject: ethtool: Add WAKE_FILTER and RX_CLS_FLOW_WAKE

Add the ability to specify through ethtool::rxnfc that a rule location is
special and will be used to participate in Wake-on-LAN, e.g. by having a
specific pattern be matched. When this is the case, fs->ring_cookie must
be set to the special value RX_CLS_FLOW_WAKE.

We also define an additional ethtool::wolinfo flag: WAKE_FILTER which
can be used to configure an Ethernet adapter to allow Wake-on-LAN using
previously programmed filters.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 813282cc8af6..dc69391d2bba 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -870,7 +870,8 @@ struct ethtool_flow_ext {
  *	includes the %FLOW_EXT or %FLOW_MAC_EXT flag
  *	(see &struct ethtool_flow_ext description).
  * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
- *	if packets should be discarded
+ *	if packets should be discarded, or %RX_CLS_FLOW_WAKE if the
+ *	packets should be used for Wake-on-LAN with %WAKE_FILTER
  * @location: Location of rule in the table.  Locations must be
  *	numbered such that a flow matching multiple rules will be
  *	classified according to the first (lowest numbered) rule.
@@ -1634,6 +1635,7 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define WAKE_ARP		(1 << 4)
 #define WAKE_MAGIC		(1 << 5)
 #define WAKE_MAGICSECURE	(1 << 6) /* only meaningful if WAKE_MAGIC */
+#define WAKE_FILTER		(1 << 7)
 
 /* L2-L4 network traffic flow types */
 #define	TCP_V4_FLOW	0x01	/* hash or spec (tcp_ip4_spec) */
@@ -1671,6 +1673,7 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	RXH_DISCARD	(1 << 31)
 
 #define	RX_CLS_FLOW_DISC	0xffffffffffffffffULL
+#define RX_CLS_FLOW_WAKE	0xfffffffffffffffeULL
 
 /* Special RX classification rule insert location values */
 #define RX_CLS_LOC_SPECIAL	0x80000000	/* flag */
-- 
cgit v1.2.3


From 0a6e77784f490912d81b92cfd48424541c04691e Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Tue, 7 Aug 2018 17:36:01 +0200
Subject: net/sched: allow flower to match tunnel options

Allow matching on options in Geneve tunnel headers.
This makes use of existing tunnel metadata support.

The options can be described in the form
CLASS:TYPE:DATA/CLASS_MASK:TYPE_MASK:DATA_MASK, where CLASS is
represented as a 16bit hexadecimal value, TYPE as an 8bit
hexadecimal value and DATA as a variable length hexadecimal value.

e.g.
 # ip link add name geneve0 type geneve dstport 0 external
 # tc qdisc add dev geneve0 ingress
 # tc filter add dev geneve0 protocol ip parent ffff: \
     flower \
       enc_src_ip 10.0.99.192 \
       enc_dst_ip 10.0.99.193 \
       enc_key_id 11 \
       geneve_opts 0102:80:1122334421314151/ffff:ff:ffffffffffffffff \
       ip_proto udp \
       action mirred egress redirect dev eth1

This patch adds support for matching Geneve options in the order
supplied by the user. This leads to an efficient implementation in
the software datapath (and in our opinion hardware datapaths that
offload this feature). It is also compatible with Geneve options
matching provided by the Open vSwitch kernel datapath which is
relevant here as the Flower classifier may be used as a mechanism
to program flows into hardware as a form of Open vSwitch datapath
offload (sometimes referred to as OVS-TC). The netlink
Kernel/Userspace API may be extended, for example by adding a flag,
if other matching options are desired, for example matching given
options in any order. This would require an implementation in the
TC software datapath, and should be done in a way that drivers that
facilitate offload of the Flower classifier can reject or accept
such flows based on hardware datapath capabilities.

This approach was discussed and agreed on at Netconf 2017 in Seoul.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  26 +++++
 net/sched/cls_flower.c       | 244 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 269 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 48e5b5d49a34..be382fb0592d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -480,11 +480,37 @@ enum {
 	TCA_FLOWER_KEY_ENC_IP_TTL,	/* u8 */
 	TCA_FLOWER_KEY_ENC_IP_TTL_MASK,	/* u8 */
 
+	TCA_FLOWER_KEY_ENC_OPTS,
+	TCA_FLOWER_KEY_ENC_OPTS_MASK,
+
 	__TCA_FLOWER_MAX,
 };
 
 #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
 
+enum {
+	TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
+					 * TCA_FLOWER_KEY_ENC_OPT_GENEVE_
+					 * attributes
+					 */
+	__TCA_FLOWER_KEY_ENC_OPTS_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,            /* u16 */
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,             /* u8 */
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,             /* 4 to 128 bytes */
+
+	__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \
+		(__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
+
 enum {
 	TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
 	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index a3b69bb6f4b0..9da244235170 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -24,6 +24,7 @@
 #include <net/pkt_cls.h>
 #include <net/ip.h>
 #include <net/flow_dissector.h>
+#include <net/geneve.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -53,6 +54,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_tcp tcp;
 	struct flow_dissector_key_ip ip;
 	struct flow_dissector_key_ip enc_ip;
+	struct flow_dissector_key_enc_opts enc_opts;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -482,6 +484,21 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ENC_IP_TTL]	 = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_OPTS]	= { .type = NLA_NESTED },
+	[TCA_FLOWER_KEY_ENC_OPTS_MASK]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+	[TCA_FLOWER_KEY_ENC_OPTS_GENEVE]        = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+	[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]      = { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]       = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]       = { .type = NLA_BINARY,
+						       .len = 128 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -603,6 +620,145 @@ static void fl_set_key_ip(struct nlattr **tb, bool encap,
 	fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
 }
 
+static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
+			     int depth, int option_len,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1];
+	struct nlattr *class = NULL, *type = NULL, *data = NULL;
+	struct geneve_opt *opt;
+	int err, data_len = 0;
+
+	if (option_len > sizeof(struct geneve_opt))
+		data_len = option_len - sizeof(struct geneve_opt);
+
+	opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len];
+	memset(opt, 0xff, option_len);
+	opt->length = data_len / 4;
+	opt->r1 = 0;
+	opt->r2 = 0;
+	opt->r3 = 0;
+
+	/* If no mask has been provided we assume an exact match. */
+	if (!depth)
+		return sizeof(struct geneve_opt) + data_len;
+
+	if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GENEVE) {
+		NL_SET_ERR_MSG(extack, "Non-geneve option type for mask");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+			       nla, geneve_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	/* We are not allowed to omit any of CLASS, TYPE or DATA
+	 * fields from the key.
+	 */
+	if (!option_len &&
+	    (!tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] ||
+	     !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] ||
+	     !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA])) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+		return -EINVAL;
+	}
+
+	/* Omitting any of CLASS, TYPE or DATA fields is allowed
+	 * for the mask.
+	 */
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]) {
+		int new_len = key->enc_opts.len;
+
+		data = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA];
+		data_len = nla_len(data);
+		if (data_len < 4) {
+			NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+			return -ERANGE;
+		}
+		if (data_len % 4) {
+			NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+			return -ERANGE;
+		}
+
+		new_len += sizeof(struct geneve_opt) + data_len;
+		BUILD_BUG_ON(FLOW_DIS_TUN_OPTS_MAX != IP_TUNNEL_OPTS_MAX);
+		if (new_len > FLOW_DIS_TUN_OPTS_MAX) {
+			NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+			return -ERANGE;
+		}
+		opt->length = data_len / 4;
+		memcpy(opt->opt_data, nla_data(data), data_len);
+	}
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]) {
+		class = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS];
+		opt->opt_class = nla_get_be16(class);
+	}
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]) {
+		type = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE];
+		opt->type = nla_get_u8(type);
+	}
+
+	return sizeof(struct geneve_opt) + data_len;
+}
+
+static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
+			  struct fl_flow_key *mask,
+			  struct netlink_ext_ack *extack)
+{
+	const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
+	int option_len, key_depth, msk_depth = 0;
+
+	nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
+		nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+		msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+	}
+
+	nla_for_each_attr(nla_opt_key, nla_enc_key,
+			  nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
+		switch (nla_type(nla_opt_key)) {
+		case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+			option_len = 0;
+			key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+			option_len = fl_set_geneve_opt(nla_opt_key, key,
+						       key_depth, option_len,
+						       extack);
+			if (option_len < 0)
+				return option_len;
+
+			key->enc_opts.len += option_len;
+			/* At the same time we need to parse through the mask
+			 * in order to verify exact and mask attribute lengths.
+			 */
+			mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+			option_len = fl_set_geneve_opt(nla_opt_msk, mask,
+						       msk_depth, option_len,
+						       extack);
+			if (option_len < 0)
+				return option_len;
+
+			mask->enc_opts.len += option_len;
+			if (key->enc_opts.len != mask->enc_opts.len) {
+				NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+				return -EINVAL;
+			}
+
+			if (msk_depth)
+				nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int fl_set_key(struct net *net, struct nlattr **tb,
 		      struct fl_flow_key *key, struct fl_flow_key *mask,
 		      struct netlink_ext_ack *extack)
@@ -799,6 +955,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 
 	fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
 
+	if (tb[TCA_FLOWER_KEY_ENC_OPTS]) {
+		ret = fl_set_enc_opt(tb, key, mask, extack);
+		if (ret)
+			return ret;
+	}
+
 	if (tb[TCA_FLOWER_KEY_FLAGS])
 		ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
 
@@ -894,6 +1056,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
 			     FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
+	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+			     FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
 
 	skb_flow_dissector_init(dissector, keys, cnt);
 }
@@ -1414,6 +1578,83 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
 	return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
 }
 
+static int fl_dump_key_geneve_opt(struct sk_buff *skb,
+				  struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct geneve_opt *opt;
+	struct nlattr *nest;
+	int opt_off = 0;
+
+	nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
+	if (!nest)
+		goto nla_put_failure;
+
+	while (enc_opts->len > opt_off) {
+		opt = (struct geneve_opt *)&enc_opts->data[opt_off];
+
+		if (nla_put_be16(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,
+				 opt->opt_class))
+			goto nla_put_failure;
+		if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,
+			       opt->type))
+			goto nla_put_failure;
+		if (nla_put(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,
+			    opt->length * 4, opt->opt_data))
+			goto nla_put_failure;
+
+		opt_off += sizeof(struct geneve_opt) + opt->length * 4;
+	}
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
+			       struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct nlattr *nest;
+	int err;
+
+	if (!enc_opts->len)
+		return 0;
+
+	nest = nla_nest_start(skb, enc_opt_type);
+	if (!nest)
+		goto nla_put_failure;
+
+	switch (enc_opts->dst_opt_type) {
+	case TUNNEL_GENEVE_OPT:
+		err = fl_dump_key_geneve_opt(skb, enc_opts);
+		if (err)
+			goto nla_put_failure;
+		break;
+	default:
+		goto nla_put_failure;
+	}
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int fl_dump_key_enc_opt(struct sk_buff *skb,
+			       struct flow_dissector_key_enc_opts *key_opts,
+			       struct flow_dissector_key_enc_opts *msk_opts)
+{
+	int err;
+
+	err = fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS, key_opts);
+	if (err)
+		return err;
+
+	return fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS_MASK, msk_opts);
+}
+
 static int fl_dump_key(struct sk_buff *skb, struct net *net,
 		       struct fl_flow_key *key, struct fl_flow_key *mask)
 {
@@ -1594,7 +1835,8 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 			    &mask->enc_tp.dst,
 			    TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
 			    sizeof(key->enc_tp.dst)) ||
-	    fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip))
+	    fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip) ||
+	    fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
 		goto nla_put_failure;
 
 	if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
-- 
cgit v1.2.3


From 212dfd909ea8b630e5d6fa4d25aeec9c4b4b14a5 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 7 Aug 2018 22:06:52 +0200
Subject: netfilter: nfnetlink_osf: add missing enum in nfnetlink_osf uapi
 header

xt_osf_window_size_options was originally part of
include/uapi/linux/netfilter/xt_osf.h, restore it.

Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf")
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_osf.h      | 12 ------------
 include/uapi/linux/netfilter/nfnetlink_osf.h | 12 ++++++++++++
 include/uapi/linux/netfilter/xt_osf.h        |  1 +
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h
index a7311bc03d3a..ecf7dab81e9e 100644
--- a/include/linux/netfilter/nfnetlink_osf.h
+++ b/include/linux/netfilter/nfnetlink_osf.h
@@ -4,18 +4,6 @@
 
 #include <uapi/linux/netfilter/nfnetlink_osf.h>
 
-/* Initial window size option state machine: multiple of mss, mtu or
- * plain numeric value. Can also be made as plain numeric value which
- * is not a multiple of specified value.
- */
-enum nf_osf_window_size_options {
-	OSF_WSS_PLAIN   = 0,
-	OSF_WSS_MSS,
-	OSF_WSS_MTU,
-	OSF_WSS_MODULO,
-	OSF_WSS_MAX,
-};
-
 enum osf_fmatch_states {
 	/* Packet does not match the fingerprint */
 	FMATCH_WRONG = 0,
diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h
index 3b93fbb9fc24..76a3527df5dd 100644
--- a/include/uapi/linux/netfilter/nfnetlink_osf.h
+++ b/include/uapi/linux/netfilter/nfnetlink_osf.h
@@ -88,6 +88,18 @@ enum iana_options {
 	OSFOPT_EMPTY = 255,
 };
 
+/* Initial window size option state machine: multiple of mss, mtu or
+ * plain numeric value. Can also be made as plain numeric value which
+ * is not a multiple of specified value.
+ */
+enum nf_osf_window_size_options {
+	OSF_WSS_PLAIN	= 0,
+	OSF_WSS_MSS,
+	OSF_WSS_MTU,
+	OSF_WSS_MODULO,
+	OSF_WSS_MAX,
+};
+
 enum nf_osf_attr_type {
 	OSF_ATTR_UNSPEC,
 	OSF_ATTR_FINGER,
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index c56c59605c2b..24102b5286ec 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -46,6 +46,7 @@
 #define xt_osf_finger		nf_osf_finger
 #define xt_osf_nlmsg		nf_osf_nlmsg
 
+#define xt_osf_window_size_options	nf_osf_window_size_options
 #define xt_osf_attr_type	nf_osf_attr_type
 #define xt_osf_msg_types	nf_osf_msg_types
 
-- 
cgit v1.2.3


From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 8 Aug 2018 01:01:24 -0700
Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY

This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.

To unleash the full potential of a bpf prog, it is essential for the
userspace to be capable of directly setting up a bpf map which can then
be consumed by the bpf prog to make decisions.  In this case, decide which
SO_REUSEPORT sk to serve the incoming request.

By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control
and visibility on where a SO_REUSEPORT sk should be located in a bpf map.
The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that
the bpf prog can directly select a sk from the bpf map.  That will
raise the programmability of the bpf prog attached to a reuseport
group (a group of sk serving the same IP:PORT).

For example, in UDP, the bpf prog can peek into the payload (e.g.
through the "data" pointer introduced in the later patch) to learn
the application level's connection information and then decide which sk
to pick from a bpf map.  The userspace can tightly couple the sk's location
in a bpf map with the application logic in generating the UDP payload's
connection information.  This connection info contract/API stays within the
userspace.

Also, when used with map-in-map, the userspace can switch the
old-server-process's inner map to a new-server-process's inner map
in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)".
The bpf prog will then direct incoming requests to the new process instead
of the old process.  The old process can finish draining the pending
requests (e.g. by "accept()") before closing the old-fds.  [Note that
deleting a fd from a bpf map does not necessarily mean the fd is closed]

During map_update_elem(),
Only SO_REUSEPORT sk (i.e. which has already been added
to a reuse->socks[]) can be used.  That means a SO_REUSEPORT sk that is
"bind()" for UDP or "bind()+listen()" for TCP.  These conditions are
ensured in "reuseport_array_update_check()".

A SO_REUSEPORT sk can only be added once to a map (i.e. the
same sk cannot be added twice even to the same map).  SO_REUSEPORT
already allows another sk to be created for the same IP:PORT.
There is no need to re-create a similar usage in the BPF side.

When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"),
it will notify the bpf map to remove it from the map also.  It is
done through "bpf_sk_reuseport_detach()" and it will only be called
if >=1 of the "reuse->socks[]" has ever been added to a bpf map.

The map_update()/map_delete() has to be in-sync with the
"reuse->socks[]".  Hence, the same "reuseport_lock" used
by "reuse->socks[]" has to be used here also. Care has
been taken to ensure the lock is only acquired when the
adding sk passes some strict tests, and
freeing the map does not require the reuseport_lock.

The reuseport_array will also support lookup from the syscall
side.  It will return a sock_gen_cookie().  The sock_gen_cookie()
is on-demand (i.e. a sk's cookie is not generated until the very
first map_lookup_elem()).

The lookup cookie is 64bits but it goes against the logical userspace
expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also).
It may catch the user by surprise if we enforce value_size=8 while
userspace still passes a 32bits fd during update.  Supporting different
value_size between lookup and update seems unintuitive also.

We also need to consider what if other existing fd based maps want
to return 64bits value from syscall's lookup in the future.
Hence, reuseport_array supports both value_size 4 and 8, and
assuming user will usually use value_size=4.  The syscall's lookup
will return ENOSPC on value_size=4.  It will only
return a 64bits value from sock_gen_cookie() when the user consciously
chooses value_size=8 (as a signal that lookup is desired) which then
requires a 64bits value in both lookup and update.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h          |  28 ++++
 include/linux/bpf_types.h    |   3 +
 include/uapi/linux/bpf.h     |   1 +
 kernel/bpf/Makefile          |   3 +
 kernel/bpf/arraymap.c        |   2 +-
 kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c         |   6 +
 net/core/sock_reuseport.c    |   8 +
 8 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/reuseport_array.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed..db11662faea6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 }
 
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+int array_map_alloc_check(union bpf_attr *attr);
 
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
 }
 #endif
 
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags);
+#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
+						     void *key, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
+						     void *key, void *value,
+						     u64 map_flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_SYSCALL */
+#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index add08be53b6f..14fd6c02d258 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 #endif
+#ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
+#endif
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd5758dc35d3..40f584bc7da0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_XSKMAP,
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e8906cbad81f..0488b8258321 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
 obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2aa55d030c77..f6ca3e712831 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static int array_map_alloc_check(union bpf_attr *attr)
+int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
new file mode 100644
index 000000000000..18e225de80ff
--- /dev/null
+++ b/kernel/bpf/reuseport_array.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/sock_diag.h>
+#include <net/sock_reuseport.h>
+
+struct reuseport_array {
+	struct bpf_map map;
+	struct sock __rcu *ptrs[];
+};
+
+static struct reuseport_array *reuseport_array(struct bpf_map *map)
+{
+	return (struct reuseport_array *)map;
+}
+
+/* The caller must hold the reuseport_lock */
+void bpf_sk_reuseport_detach(struct sock *sk)
+{
+	struct sock __rcu **socks;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	socks = sk->sk_user_data;
+	if (socks) {
+		WRITE_ONCE(sk->sk_user_data, NULL);
+		/*
+		 * Do not move this NULL assignment outside of
+		 * sk->sk_callback_lock because there is
+		 * a race with reuseport_array_free()
+		 * which does not hold the reuseport_lock.
+		 */
+		RCU_INIT_POINTER(*socks, NULL);
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int reuseport_array_alloc_check(union bpf_attr *attr)
+{
+	if (attr->value_size != sizeof(u32) &&
+	    attr->value_size != sizeof(u64))
+		return -EINVAL;
+
+	return array_map_alloc_check(attr);
+}
+
+static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return rcu_dereference(array->ptrs[index]);
+}
+
+/* Called from syscall only */
+static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = *(u32 *)key;
+	struct sock *sk;
+	int err;
+
+	if (index >= map->max_entries)
+		return -E2BIG;
+
+	if (!rcu_access_pointer(array->ptrs[index]))
+		return -ENOENT;
+
+	spin_lock_bh(&reuseport_lock);
+
+	sk = rcu_dereference_protected(array->ptrs[index],
+				       lockdep_is_held(&reuseport_lock));
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		WRITE_ONCE(sk->sk_user_data, NULL);
+		RCU_INIT_POINTER(array->ptrs[index], NULL);
+		write_unlock_bh(&sk->sk_callback_lock);
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return err;
+}
+
+static void reuseport_array_free(struct bpf_map *map)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	struct sock *sk;
+	u32 i;
+
+	synchronize_rcu();
+
+	/*
+	 * ops->map_*_elem() will not be able to access this
+	 * array now. Hence, this function only races with
+	 * bpf_sk_reuseport_detach() which was triggered by
+	 * close() or disconnect().
+	 *
+	 * This function and bpf_sk_reuseport_detach() are
+	 * both removing sk from "array".  Who removes it
+	 * first does not matter.
+	 *
+	 * The only concern here is bpf_sk_reuseport_detach()
+	 * may access "array" which is being freed here.
+	 * bpf_sk_reuseport_detach() access this "array"
+	 * through sk->sk_user_data _and_ with sk->sk_callback_lock
+	 * held which is enough because this "array" is not freed
+	 * until all sk->sk_user_data has stopped referencing this "array".
+	 *
+	 * Hence, due to the above, taking "reuseport_lock" is not
+	 * needed here.
+	 */
+
+	/*
+	 * Since reuseport_lock is not taken, sk is accessed under
+	 * rcu_read_lock()
+	 */
+	rcu_read_lock();
+	for (i = 0; i < map->max_entries; i++) {
+		sk = rcu_dereference(array->ptrs[i]);
+		if (sk) {
+			write_lock_bh(&sk->sk_callback_lock);
+			/*
+			 * No need for WRITE_ONCE(). At this point,
+			 * no one is reading it without taking the
+			 * sk->sk_callback_lock.
+			 */
+			sk->sk_user_data = NULL;
+			write_unlock_bh(&sk->sk_callback_lock);
+			RCU_INIT_POINTER(array->ptrs[i], NULL);
+		}
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Once reaching here, all sk->sk_user_data is not
+	 * referencing this "array".  "array" can be freed now.
+	 */
+	bpf_map_area_free(array);
+}
+
+static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
+{
+	int err, numa_node = bpf_map_attr_numa_node(attr);
+	struct reuseport_array *array;
+	u64 cost, array_size;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	array_size = sizeof(*array);
+	array_size += (u64)attr->max_entries * sizeof(struct sock *);
+
+	/* make sure there is no u32 overflow later in round_up() */
+	cost = array_size;
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-ENOMEM);
+	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	err = bpf_map_precharge_memlock(cost);
+	if (err)
+		return ERR_PTR(err);
+
+	/* allocate all map elements and zero-initialize them */
+	array = bpf_map_area_alloc(array_size, numa_node);
+	if (!array)
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	bpf_map_init_from_attr(&array->map, attr);
+	array->map.pages = cost;
+
+	return &array->map;
+}
+
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value)
+{
+	struct sock *sk;
+	int err;
+
+	if (map->value_size != sizeof(u64))
+		return -ENOSPC;
+
+	rcu_read_lock();
+	sk = reuseport_array_lookup_elem(map, key);
+	if (sk) {
+		*(u64 *)value = sock_gen_cookie(sk);
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+static int
+reuseport_array_update_check(const struct reuseport_array *array,
+			     const struct sock *nsk,
+			     const struct sock *osk,
+			     const struct sock_reuseport *nsk_reuse,
+			     u32 map_flags)
+{
+	if (osk && map_flags == BPF_NOEXIST)
+		return -EEXIST;
+
+	if (!osk && map_flags == BPF_EXIST)
+		return -ENOENT;
+
+	if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
+		return -ENOTSUPP;
+
+	if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
+		return -ENOTSUPP;
+
+	if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
+		return -ENOTSUPP;
+
+	/*
+	 * sk must be hashed (i.e. listening in the TCP case or bound
+	 * in the UDP case) and
+	 * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
+	 *
+	 * Also, sk will be used in bpf helper that is protected by
+	 * rcu_read_lock().
+	 */
+	if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
+		return -EINVAL;
+
+	/* READ_ONCE because the sk->sk_callback_lock may not be held here */
+	if (READ_ONCE(nsk->sk_user_data))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Called from syscall only.
+ * The "nsk" is held by the fd refcnt.
+ * The "osk" and "reuse" are protected by reuseport_lock.
+ */
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	struct sock *free_osk = NULL, *osk, *nsk;
+	struct sock_reuseport *reuse;
+	u32 index = *(u32 *)key;
+	struct socket *socket;
+	int err, fd;
+
+	if (map_flags > BPF_EXIST)
+		return -EINVAL;
+
+	if (index >= map->max_entries)
+		return -E2BIG;
+
+	if (map->value_size == sizeof(u64)) {
+		u64 fd64 = *(u64 *)value;
+
+		if (fd64 > S32_MAX)
+			return -EINVAL;
+		fd = fd64;
+	} else {
+		fd = *(int *)value;
+	}
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	nsk = socket->sk;
+	if (!nsk) {
+		err = -EINVAL;
+		goto put_file;
+	}
+
+	/* Quick checks before taking reuseport_lock */
+	err = reuseport_array_update_check(array, nsk,
+					   rcu_access_pointer(array->ptrs[index]),
+					   rcu_access_pointer(nsk->sk_reuseport_cb),
+					   map_flags);
+	if (err)
+		goto put_file;
+
+	spin_lock_bh(&reuseport_lock);
+	/*
+	 * Some of the checks only need reuseport_lock
+	 * but it is done under sk_callback_lock also
+	 * for simplicity reason.
+	 */
+	write_lock_bh(&nsk->sk_callback_lock);
+
+	osk = rcu_dereference_protected(array->ptrs[index],
+					lockdep_is_held(&reuseport_lock));
+	reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
+	if (err)
+		goto put_file_unlock;
+
+	/* Ensure reuse->reuseport_id is set */
+	err = reuseport_get_id(reuse);
+	if (err < 0)
+		goto put_file_unlock;
+
+	WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
+	rcu_assign_pointer(array->ptrs[index], nsk);
+	free_osk = osk;
+	err = 0;
+
+put_file_unlock:
+	write_unlock_bh(&nsk->sk_callback_lock);
+
+	if (free_osk) {
+		write_lock_bh(&free_osk->sk_callback_lock);
+		WRITE_ONCE(free_osk->sk_user_data, NULL);
+		write_unlock_bh(&free_osk->sk_callback_lock);
+	}
+
+	spin_unlock_bh(&reuseport_lock);
+put_file:
+	fput(socket->file);
+	return err;
+}
+
+/* Called from syscall */
+static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
+					void *next_key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops reuseport_array_ops = {
+	.map_alloc_check = reuseport_array_alloc_check,
+	.map_alloc = reuseport_array_alloc,
+	.map_free = reuseport_array_free,
+	.map_lookup_elem = reuseport_array_lookup_elem,
+	.map_get_next_key = reuseport_array_get_next_key,
+	.map_delete_elem = reuseport_array_delete_elem,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5af4e9e2722d..57f4d076141b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_fd_array_map_lookup_elem(map, key, value);
 	} else if (IS_FD_HASH(map)) {
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
@@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
 						  attr->flags);
 		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		/* rcu_read_lock() is not needed */
+		err = bpf_fd_reuseport_array_update_elem(map, key, value,
+							 attr->flags);
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index cf2e4d305af9..8235f2439816 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -186,6 +186,14 @@ void reuseport_detach_sock(struct sock *sk)
 	spin_lock_bh(&reuseport_lock);
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
+
+	/* At least one of the sk in this reuseport group is added to
+	 * a bpf map.  Notify the bpf side.  The bpf map logic will
+	 * remove the sk if it is indeed added to a bpf map.
+	 */
+	if (reuse->reuseport_id)
+		bpf_sk_reuseport_detach(sk);
+
 	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
 
 	for (i = 0; i < reuse->num_socks; i++) {
-- 
cgit v1.2.3


From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 8 Aug 2018 01:01:25 -0700
Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT

This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select
a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.  Like other
non SK_FILTER/CGROUP_SKB programs, it requires CAP_SYS_ADMIN.

BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern"
to store the bpf context instead of using the skb->cb[48].

At the SO_REUSEPORT sk lookup time, it is in the middle of transiting
from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp).  At this
point, it is not always clear where the bpf context can be appended
in the skb->cb[48] to avoid saving-and-restoring cb[].  Even putting
aside the difference between ipv4-vs-ipv6 and udp-vs-tcp, it is not
clear whether the lower layer will only be ipv4 and ipv6 in the future,
or whether it will leave the cb[] untouched before transiting to the
upper layer.

For example, in udp_gro_receive(), it uses the 48-byte NAPI_GRO_CB
instead of IP[6]CB and it may still modify the cb[] after calling
the udp[46]_lib_lookup_skb().  For the above reason, if
skb->cb is used for the bpf ctx, saving-and-restoring is needed
and likely the whole 48 bytes of cb[] have to be saved and restored.

Instead of saving, setting and restoring the cb[], this patch opts
to create a new "struct sk_reuseport_kern" and set the needed
values in there.

The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations.  There is no protocol
specific usage at this point and it is also in line with the current
sock_reuseport.c implementation (i.e. no protocol specific requirement).

In "struct sk_reuseport_md", this patch exposes data/data_end/len
with semantics similar to other existing usages.  Together
with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()",
the bpf prog can peek anywhere in the skb.  The "bind_inany" tells
the bpf prog that the reuseport group is bound to a local
INANY address which cannot be learned from skb.

The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order
to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run.  It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
being added to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").

The new "sk_select_reuseport()" is the main helper that the
bpf prog will use to select a SO_REUSEPORT sk.  It is the only function
that can use the new BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.  As mentioned in
the earlier patch, the validity of a selected sk is checked at
run time in "sk_select_reuseport()".  Doing the check at
verification time is difficult and inflexible (consider the map-in-map
use case).  The runtime check compares the selected sk's reuseport_id
with the reuseport_id that we want.  This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. the reuseport_id
does not match).  The bpf prog can decide whether to do SK_DROP at its
discretion.

When the bpf prog returns SK_PASS, the kernel will check if a
valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL").
If so, it will use the selected sk.  If not, the kernel
will select one from "reuse->socks[]" (as before this patch).

The SK_DROP and SK_PASS handling logic will be in the next patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h       |   3 +
 include/linux/filter.h          |  15 +++
 include/net/addrconf.h          |   1 +
 include/net/sock_reuseport.h    |   6 +-
 include/uapi/linux/bpf.h        |  36 +++++-
 kernel/bpf/verifier.c           |   9 ++
 net/core/filter.c               | 269 +++++++++++++++++++++++++++++++++++++++-
 net/core/sock_reuseport.c       |  20 ++-
 net/ipv4/inet_connection_sock.c |   9 ++
 net/ipv4/inet_hashtables.c      |   5 +-
 net/ipv4/udp.c                  |   5 +-
 11 files changed, 365 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 14fd6c02d258..cd26c090e7c0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #endif
+#ifdef CONFIG_INET
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b072dab32c0..70e9d57677fe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -32,6 +32,7 @@ struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
 struct xdp_buff;
+struct sock_reuseport;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act);
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
 struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
 
+#ifdef CONFIG_INET
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+				  struct bpf_prog *prog, struct sk_buff *skb,
+				  u32 hash);
+#else
+static inline struct sock *
+bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+		     struct bpf_prog *prog, struct sk_buff *skb,
+		     u32 hash)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 5f43f7a70fe6..6def0351bcc3 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
 bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 			  bool match_wildcard);
+bool inet_rcv_saddr_any(const struct sock *sk);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index e1a7681856f7..73b569556be6 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -21,12 +21,14 @@ struct sock_reuseport {
 	unsigned int		synq_overflow_ts;
 	/* ID stays the same even after the size of socks[] grows. */
 	unsigned int		reuseport_id;
+	bool			bind_inany;
 	struct bpf_prog __rcu	*prog;		/* optional BPF sock selector */
 	struct sock		*socks[0];	/* array of sock pointers */
 };
 
-extern int reuseport_alloc(struct sock *sk);
-extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
+extern int reuseport_alloc(struct sock *sk, bool bind_inany);
+extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
+			      bool bind_inany);
 extern void reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 40f584bc7da0..3102a2a23c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -151,6 +151,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
+	BPF_PROG_TYPE_SK_REUSEPORT,
 };
 
 enum bpf_attach_type {
@@ -2114,6 +2115,14 @@ union bpf_attr {
  *		the shared data.
  *	Return
  *		Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Select a SO_REUSEPORT sk from a
+ *		BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.  It checks that the
+ *		selected sk matches the incoming request in the skb.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2197,7 +2206,8 @@ union bpf_attr {
 	FN(rc_keydown),			\
 	FN(skb_cgroup_id),		\
 	FN(get_current_cgroup_id),	\
-	FN(get_local_storage),
+	FN(get_local_storage),		\
+	FN(sk_select_reuseport),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2414,6 +2424,30 @@ struct sk_msg_md {
 	__u32 local_port;	/* stored in host byte order */
 };
 
+struct sk_reuseport_md {
+	/*
+	 * Start of directly accessible data. It begins from
+	 * the tcp/udp header.
+	 */
+	void *data;
+	void *data_end;		/* End of directly accessible data */
+	/*
+	 * Total length of packet (starting from the tcp/udp header).
+	 * Note that the directly accessible bytes (data_end - data)
+	 * could be less than this "len".  Those bytes could be
+	 * indirectly read by a helper "bpf_skb_load_bytes()".
+	 */
+	__u32 len;
+	/*
+	 * Eth protocol in the mac header (network byte order). e.g.
+	 * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+	 */
+	__u32 eth_protocol;
+	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+	__u32 bind_inany;	/* Is sock bound to an INANY address? */
+	__u32 hash;		/* A hash of the packet 4 tuples */
+};
+
 #define BPF_TAG_SIZE	8
 
 struct bpf_prog_info {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 587468a9c37d..ca90679a7fe5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
 	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+	case BPF_PROG_TYPE_SK_REUSEPORT:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
@@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_msg_redirect_hash)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+		if (func_id != BPF_FUNC_sk_select_reuseport)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
 			goto error;
 		break;
+	case BPF_FUNC_sk_select_reuseport:
+		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
+			goto error;
+		break;
 	default:
 		break;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 2de7dd9f2a57..142595b4e0d1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
 		return -ENOMEM;
 
 	if (sk_unhashed(sk) && sk->sk_reuseport) {
-		err = reuseport_alloc(sk);
+		err = reuseport_alloc(sk, false);
 		if (err)
 			return err;
 	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -7013,3 +7013,270 @@ out:
 	release_sock(sk);
 	return ret;
 }
+
+#ifdef CONFIG_INET
+struct sk_reuseport_kern {
+	struct sk_buff *skb;
+	struct sock *sk;
+	struct sock *selected_sk;
+	void *data_end;
+	u32 hash;
+	u32 reuseport_id;
+	bool bind_inany;
+};
+
+static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
+				    struct sock_reuseport *reuse,
+				    struct sock *sk, struct sk_buff *skb,
+				    u32 hash)
+{
+	reuse_kern->skb = skb;
+	reuse_kern->sk = sk;
+	reuse_kern->selected_sk = NULL;
+	reuse_kern->data_end = skb->data + skb_headlen(skb);
+	reuse_kern->hash = hash;
+	reuse_kern->reuseport_id = reuse->reuseport_id;
+	reuse_kern->bind_inany = reuse->bind_inany;
+}
+
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+				  struct bpf_prog *prog, struct sk_buff *skb,
+				  u32 hash)
+{
+	struct sk_reuseport_kern reuse_kern;
+	enum sk_action action;
+
+	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+	action = BPF_PROG_RUN(prog, &reuse_kern);
+
+	if (action == SK_PASS)
+		return reuse_kern.selected_sk;
+	else
+		return ERR_PTR(-ECONNREFUSED);
+}
+
+BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
+	   struct bpf_map *, map, void *, key, u32, flags)
+{
+	struct sock_reuseport *reuse;
+	struct sock *selected_sk;
+
+	selected_sk = map->ops->map_lookup_elem(map, key);
+	if (!selected_sk)
+		return -ENOENT;
+
+	reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
+	if (!reuse)
+		/* selected_sk is unhashed (e.g. by close()) after the
+		 * above map_lookup_elem().  Treat selected_sk as if it
+		 * has already been removed from the map.
+		 */
+		return -ENOENT;
+
+	if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
+		struct sock *sk;
+
+		if (unlikely(!reuse_kern->reuseport_id))
+			/* There is a small race between adding the
+			 * sk to the map and setting the
+			 * reuse_kern->reuseport_id.
+			 * Treat it as the sk has not been added to
+			 * the bpf map yet.
+			 */
+			return -ENOENT;
+
+		sk = reuse_kern->sk;
+		if (sk->sk_protocol != selected_sk->sk_protocol)
+			return -EPROTOTYPE;
+		else if (sk->sk_family != selected_sk->sk_family)
+			return -EAFNOSUPPORT;
+
+		/* Catch all. Likely bound to a different sockaddr. */
+		return -EBADFD;
+	}
+
+	reuse_kern->selected_sk = selected_sk;
+
+	return 0;
+}
+
+static const struct bpf_func_proto sk_select_reuseport_proto = {
+	.func           = sk_select_reuseport,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(sk_reuseport_load_bytes,
+	   const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+	   void *, to, u32, len)
+{
+	return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
+	.func		= sk_reuseport_load_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(sk_reuseport_load_bytes_relative,
+	   const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+	   void *, to, u32, len, u32, start_header)
+{
+	return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
+					       len, start_header);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
+	.func		= sk_reuseport_load_bytes_relative,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_reuseport_func_proto(enum bpf_func_id func_id,
+			const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_sk_select_reuseport:
+		return &sk_select_reuseport_proto;
+	case BPF_FUNC_skb_load_bytes:
+		return &sk_reuseport_load_bytes_proto;
+	case BPF_FUNC_skb_load_bytes_relative:
+		return &sk_reuseport_load_bytes_relative_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static bool
+sk_reuseport_is_valid_access(int off, int size,
+			     enum bpf_access_type type,
+			     const struct bpf_prog *prog,
+			     struct bpf_insn_access_aux *info)
+{
+	const u32 size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
+	    off % size || type != BPF_READ)
+		return false;
+
+	switch (off) {
+	case offsetof(struct sk_reuseport_md, data):
+		info->reg_type = PTR_TO_PACKET;
+		return size == sizeof(__u64);
+
+	case offsetof(struct sk_reuseport_md, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		return size == sizeof(__u64);
+
+	case offsetof(struct sk_reuseport_md, hash):
+		return size == size_default;
+
+	/* Fields that allow narrowing */
+	case offsetof(struct sk_reuseport_md, eth_protocol):
+		if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+			return false;
+	case offsetof(struct sk_reuseport_md, ip_protocol):
+	case offsetof(struct sk_reuseport_md, bind_inany):
+	case offsetof(struct sk_reuseport_md, len):
+		bpf_ctx_record_field_size(info, size_default);
+		return bpf_ctx_narrow_access_ok(off, size, size_default);
+
+	default:
+		return false;
+	}
+}
+
+#define SK_REUSEPORT_LOAD_FIELD(F) ({					\
+	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+			      si->dst_reg, si->src_reg,			\
+			      bpf_target_off(struct sk_reuseport_kern, F, \
+					     FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+					     target_size));		\
+	})
+
+#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)				\
+	SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,		\
+				    struct sk_buff,			\
+				    skb,				\
+				    SKB_FIELD)
+
+#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
+	SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern,	\
+					     struct sock,		\
+					     sk,			\
+					     SK_FIELD, BPF_SIZE, EXTRA_OFF)
+
+static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
+					   const struct bpf_insn *si,
+					   struct bpf_insn *insn_buf,
+					   struct bpf_prog *prog,
+					   u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct sk_reuseport_md, data):
+		SK_REUSEPORT_LOAD_SKB_FIELD(data);
+		break;
+
+	case offsetof(struct sk_reuseport_md, len):
+		SK_REUSEPORT_LOAD_SKB_FIELD(len);
+		break;
+
+	case offsetof(struct sk_reuseport_md, eth_protocol):
+		SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
+		break;
+
+	case offsetof(struct sk_reuseport_md, ip_protocol):
+		BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
+		SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
+						    BPF_W, 0);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+					SK_FL_PROTO_SHIFT);
+		/* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
+		 * aware.  No further narrowing or masking is needed.
+		 */
+		*target_size = 1;
+		break;
+
+	case offsetof(struct sk_reuseport_md, data_end):
+		SK_REUSEPORT_LOAD_FIELD(data_end);
+		break;
+
+	case offsetof(struct sk_reuseport_md, hash):
+		SK_REUSEPORT_LOAD_FIELD(hash);
+		break;
+
+	case offsetof(struct sk_reuseport_md, bind_inany):
+		SK_REUSEPORT_LOAD_FIELD(bind_inany);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
+	.get_func_proto		= sk_reuseport_func_proto,
+	.is_valid_access	= sk_reuseport_is_valid_access,
+	.convert_ctx_access	= sk_reuseport_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_reuseport_prog_ops = {
+};
+#endif /* CONFIG_INET */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 8235f2439816..d260167f5f77 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
 	return reuse;
 }
 
-int reuseport_alloc(struct sock *sk)
+int reuseport_alloc(struct sock *sk, bool bind_inany)
 {
 	struct sock_reuseport *reuse;
 
@@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk)
 	/* Allocation attempts can occur concurrently via the setsockopt path
 	 * and the bind/hash path.  Nothing to do when we lose the race.
 	 */
-	if (rcu_dereference_protected(sk->sk_reuseport_cb,
-				      lockdep_is_held(&reuseport_lock)))
+	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	if (reuse) {
+		/* Only set reuse->bind_inany if the bind_inany is true.
+		 * Otherwise, it will overwrite the reuse->bind_inany
+		 * which was set by the bind/hash path.
+		 */
+		if (bind_inany)
+			reuse->bind_inany = bind_inany;
 		goto out;
+	}
 
 	reuse = __reuseport_alloc(INIT_SOCKS);
 	if (!reuse) {
@@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk)
 
 	reuse->socks[0] = sk;
 	reuse->num_socks = 1;
+	reuse->bind_inany = bind_inany;
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
 out:
@@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
 	more_reuse->num_socks = reuse->num_socks;
 	more_reuse->prog = reuse->prog;
 	more_reuse->reuseport_id = reuse->reuseport_id;
+	more_reuse->bind_inany = reuse->bind_inany;
 
 	memcpy(more_reuse->socks, reuse->socks,
 	       reuse->num_socks * sizeof(struct sock *));
@@ -136,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
  *  @sk2: Socket belonging to the existing reuseport group.
  *  May return ENOMEM and not add socket to group under memory pressure.
  */
-int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 {
 	struct sock_reuseport *old_reuse, *reuse;
 
 	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
-		int err = reuseport_alloc(sk2);
+		int err = reuseport_alloc(sk2, bind_inany);
 
 		if (err)
 			return err;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 33a88e045efd..dfd5009f96ef 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 }
 EXPORT_SYMBOL(inet_rcv_saddr_equal);
 
+bool inet_rcv_saddr_any(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#endif
+	return !sk->sk_rcv_saddr;
+}
+
 void inet_get_local_port_range(struct net *net, int *low, int *high)
 {
 	unsigned int seq;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3647167c8fa3..370e24463fb7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -567,10 +567,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
 		    inet_csk(sk2)->icsk_bind_hash == tb &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
 		    inet_rcv_saddr_equal(sk, sk2, false))
-			return reuseport_add_sock(sk, sk2);
+			return reuseport_add_sock(sk, sk2,
+						  inet_rcv_saddr_any(sk));
 	}
 
-	return reuseport_alloc(sk);
+	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 }
 
 int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 060e841dde40..038dd7909051 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
 		    inet_rcv_saddr_equal(sk, sk2, false)) {
-			return reuseport_add_sock(sk, sk2);
+			return reuseport_add_sock(sk, sk2,
+						  inet_rcv_saddr_any(sk));
 		}
 	}
 
-	return reuseport_alloc(sk);
+	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 }
 
 /**
-- 
cgit v1.2.3


From b0e29063dcb3bf14f515f95e748b60e4bab45e7c Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 10 Aug 2018 13:22:01 +0200
Subject: l2tp: remove pppol2tp_session_ioctl()

pppol2tp_ioctl() has everything in place for handling PPPIOCGL2TPSTATS
on session sockets. We just need to copy the stats and set ->session_id.

As a side effect of sharing session and tunnel code, ->using_ipsec is
properly set even when the request was made using a session socket.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ppp-ioctl.h |  2 +-
 net/l2tp/l2tp_ppp.c            | 50 +++---------------------------------------
 2 files changed, 4 insertions(+), 48 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ppp-ioctl.h b/include/uapi/linux/ppp-ioctl.h
index 784c2e3e572e..88b5f9990320 100644
--- a/include/uapi/linux/ppp-ioctl.h
+++ b/include/uapi/linux/ppp-ioctl.h
@@ -68,7 +68,7 @@ struct ppp_option_data {
 struct pppol2tp_ioc_stats {
 	__u16		tunnel_id;	/* redundant */
 	__u16		session_id;	/* if zero, get tunnel stats */
-	__u32		using_ipsec:1;	/* valid only for session_id == 0 */
+	__u32		using_ipsec:1;
 	__aligned_u64	tx_packets;
 	__aligned_u64	tx_bytes;
 	__aligned_u64	tx_errors;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 2afd3ab8a551..bdfbd3ed7e14 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1068,52 +1068,6 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats,
 	return 0;
 }
 
-/* Session ioctl helper.
- */
-static int pppol2tp_session_ioctl(struct l2tp_session *session,
-				  unsigned int cmd, unsigned long arg)
-{
-	int err = 0;
-	struct sock *sk;
-	struct l2tp_tunnel *tunnel = session->tunnel;
-	struct pppol2tp_ioc_stats stats;
-
-	l2tp_dbg(session, L2TP_MSG_CONTROL,
-		 "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
-		 session->name, cmd, arg);
-
-	sk = pppol2tp_session_get_sock(session);
-	if (!sk)
-		return -EBADR;
-
-	switch (cmd) {
-	case PPPIOCGL2TPSTATS:
-		err = -ENXIO;
-		if (!(sk->sk_state & PPPOX_CONNECTED))
-			break;
-
-		memset(&stats, 0, sizeof(stats));
-		stats.tunnel_id = tunnel->tunnel_id;
-		stats.session_id = session->session_id;
-		pppol2tp_copy_stats(&stats, &session->stats);
-		if (copy_to_user((void __user *) arg, &stats,
-				 sizeof(stats)))
-			break;
-		l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
-			  session->name);
-		err = 0;
-		break;
-
-	default:
-		err = -ENOSYS;
-		break;
-	}
-
-	sock_put(sk);
-
-	return err;
-}
-
 static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
 			  unsigned long arg)
 {
@@ -1172,7 +1126,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
 
 			stats.session_id = session_id;
 		} else {
-			return pppol2tp_session_ioctl(session, cmd, arg);
+			memset(&stats, 0, sizeof(stats));
+			pppol2tp_copy_stats(&stats, &session->stats);
+			stats.session_id = session->session_id;
 		}
 		stats.tunnel_id = session->tunnel->tunnel_id;
 		stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel);
-- 
cgit v1.2.3


From 6f10f7d1b02b1bbc305f88d7696445dd38b13881 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Sat, 11 Aug 2018 13:19:44 +0800
Subject: bcache: style fix to replace 'unsigned' by 'unsigned int'

This patch fixes warnings reported by checkpatch.pl by replacing 'unsigned'
with 'unsigned int'.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c     |  36 +++++++------
 drivers/md/bcache/bcache.h    | 107 +++++++++++++++++++--------------------
 drivers/md/bcache/bset.c      | 114 ++++++++++++++++++++++--------------------
 drivers/md/bcache/bset.h      |  34 ++++++-------
 drivers/md/bcache/btree.c     |  50 +++++++++---------
 drivers/md/bcache/btree.h     |   4 +-
 drivers/md/bcache/closure.h   |   2 +-
 drivers/md/bcache/debug.c     |   6 +--
 drivers/md/bcache/extents.c   |  22 ++++----
 drivers/md/bcache/io.c        |  18 +++----
 drivers/md/bcache/journal.c   |  20 ++++----
 drivers/md/bcache/journal.h   |   8 +--
 drivers/md/bcache/movinggc.c  |  12 ++---
 drivers/md/bcache/request.c   |  42 ++++++++--------
 drivers/md/bcache/request.h   |  18 +++----
 drivers/md/bcache/stats.c     |  12 ++---
 drivers/md/bcache/stats.h     |   2 +-
 drivers/md/bcache/super.c     |  34 ++++++-------
 drivers/md/bcache/sysfs.c     |  18 +++----
 drivers/md/bcache/util.h      |   9 ++--
 drivers/md/bcache/writeback.c |  19 +++----
 drivers/md/bcache/writeback.h |  12 ++---
 include/uapi/linux/bcache.h   |   6 +--
 23 files changed, 309 insertions(+), 296 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 7fa2631b422c..89f663d22551 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -87,8 +87,8 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
-	unsigned i;
+	unsigned int next = c->nbuckets * c->sb.bucket_size / 1024;
+	unsigned int i;
 	int r;
 
 	atomic_sub(sectors, &c->rescale);
@@ -169,7 +169,7 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 
 #define bucket_prio(b)							\
 ({									\
-	unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\
+	unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\
 									\
 	(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);	\
 })
@@ -301,7 +301,7 @@ do {									\
 
 static int bch_allocator_push(struct cache *ca, long bucket)
 {
-	unsigned i;
+	unsigned int i;
 
 	/* Prios/gens are actually the most important reserve */
 	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
@@ -385,7 +385,7 @@ out:
 
 /* Allocation */
 
-long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
+long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
 {
 	DEFINE_WAIT(w);
 	struct bucket *b;
@@ -421,7 +421,7 @@ out:
 	if (expensive_debug_checks(ca->set)) {
 		size_t iter;
 		long i;
-		unsigned j;
+		unsigned int j;
 
 		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
 			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
@@ -470,14 +470,14 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
 
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		__bch_bucket_free(PTR_CACHE(c, k, i),
 				  PTR_BUCKET(c, k, i));
 }
 
-int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 			   struct bkey *k, int n, bool wait)
 {
 	int i;
@@ -510,7 +510,7 @@ err:
 	return -1;
 }
 
-int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
+int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 			 struct bkey *k, int n, bool wait)
 {
 	int ret;
@@ -524,8 +524,8 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
 
 struct open_bucket {
 	struct list_head	list;
-	unsigned		last_write_point;
-	unsigned		sectors_free;
+	unsigned int		last_write_point;
+	unsigned int		sectors_free;
 	BKEY_PADDED(key);
 };
 
@@ -556,7 +556,7 @@ struct open_bucket {
  */
 static struct open_bucket *pick_data_bucket(struct cache_set *c,
 					    const struct bkey *search,
-					    unsigned write_point,
+					    unsigned int write_point,
 					    struct bkey *alloc)
 {
 	struct open_bucket *ret, *ret_task = NULL;
@@ -595,12 +595,16 @@ found:
  *
  * If s->writeback is true, will not fail.
  */
-bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
-		       unsigned write_point, unsigned write_prio, bool wait)
+bool bch_alloc_sectors(struct cache_set *c,
+		       struct bkey *k,
+		       unsigned int sectors,
+		       unsigned int write_point,
+		       unsigned int write_prio,
+		       bool wait)
 {
 	struct open_bucket *b;
 	BKEY_PADDED(key) alloc;
-	unsigned i;
+	unsigned int i;
 
 	/*
 	 * We might have to allocate a new bucket, which we can't do with a
@@ -613,7 +617,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
 	spin_lock(&c->data_bucket_lock);
 
 	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
-		unsigned watermark = write_prio
+		unsigned int watermark = write_prio
 			? RESERVE_MOVINGGC
 			: RESERVE_NONE;
 
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 05f82ff6f016..1ebd2d9d90d5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -252,7 +252,7 @@ struct bcache_device {
 	struct kobject		kobj;
 
 	struct cache_set	*c;
-	unsigned		id;
+	unsigned int		id;
 #define BCACHEDEVNAME_SIZE	12
 	char			name[BCACHEDEVNAME_SIZE];
 
@@ -264,18 +264,18 @@ struct bcache_device {
 #define BCACHE_DEV_UNLINK_DONE		2
 #define BCACHE_DEV_WB_RUNNING		3
 #define BCACHE_DEV_RATE_DW_RUNNING	4
-	unsigned		nr_stripes;
-	unsigned		stripe_size;
+	unsigned int		nr_stripes;
+	unsigned int		stripe_size;
 	atomic_t		*stripe_sectors_dirty;
 	unsigned long		*full_dirty_stripes;
 
 	struct bio_set		bio_split;
 
-	unsigned		data_csum:1;
+	unsigned int		data_csum:1;
 
 	int (*cache_miss)(struct btree *, struct search *,
-			  struct bio *, unsigned);
-	int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
+			  struct bio *, unsigned int);
+	int (*ioctl) (struct bcache_device *, fmode_t, unsigned int, unsigned long);
 };
 
 struct io {
@@ -284,7 +284,7 @@ struct io {
 	struct list_head	lru;
 
 	unsigned long		jiffies;
-	unsigned		sequential;
+	unsigned int		sequential;
 	sector_t		last;
 };
 
@@ -358,18 +358,18 @@ struct cached_dev {
 	struct cache_accounting	accounting;
 
 	/* The rest of this all shows up in sysfs */
-	unsigned		sequential_cutoff;
-	unsigned		readahead;
+	unsigned int		sequential_cutoff;
+	unsigned int		readahead;
 
-	unsigned		io_disable:1;
-	unsigned		verify:1;
-	unsigned		bypass_torture_test:1;
+	unsigned int		io_disable:1;
+	unsigned int		verify:1;
+	unsigned int		bypass_torture_test:1;
 
-	unsigned		partial_stripes_expensive:1;
-	unsigned		writeback_metadata:1;
-	unsigned		writeback_running:1;
+	unsigned int		partial_stripes_expensive:1;
+	unsigned int		writeback_metadata:1;
+	unsigned int		writeback_running:1;
 	unsigned char		writeback_percent;
-	unsigned		writeback_delay;
+	unsigned int		writeback_delay;
 
 	uint64_t		writeback_rate_target;
 	int64_t			writeback_rate_proportional;
@@ -377,16 +377,16 @@ struct cached_dev {
 	int64_t			writeback_rate_integral_scaled;
 	int32_t			writeback_rate_change;
 
-	unsigned		writeback_rate_update_seconds;
-	unsigned		writeback_rate_i_term_inverse;
-	unsigned		writeback_rate_p_term_inverse;
-	unsigned		writeback_rate_minimum;
+	unsigned int		writeback_rate_update_seconds;
+	unsigned int		writeback_rate_i_term_inverse;
+	unsigned int		writeback_rate_p_term_inverse;
+	unsigned int		writeback_rate_minimum;
 
 	enum stop_on_failure	stop_when_cache_set_failed;
 #define DEFAULT_CACHED_DEV_ERROR_LIMIT	64
 	atomic_t		io_errors;
-	unsigned		error_limit;
-	unsigned		offline_seconds;
+	unsigned int		error_limit;
+	unsigned int		offline_seconds;
 
 	char			backing_dev_name[BDEVNAME_SIZE];
 };
@@ -447,7 +447,7 @@ struct cache {
 	 * until a gc finishes - otherwise we could pointlessly burn a ton of
 	 * cpu
 	 */
-	unsigned		invalidate_needs_gc;
+	unsigned int		invalidate_needs_gc;
 
 	bool			discard; /* Get rid of? */
 
@@ -472,7 +472,7 @@ struct gc_stat {
 
 	size_t			nkeys;
 	uint64_t		data;	/* sectors */
-	unsigned		in_use; /* percent */
+	unsigned int		in_use; /* percent */
 };
 
 /*
@@ -518,7 +518,7 @@ struct cache_set {
 	int			caches_loaded;
 
 	struct bcache_device	**devices;
-	unsigned		devices_max_used;
+	unsigned int		devices_max_used;
 	atomic_t		attached_dev_nr;
 	struct list_head	cached_devs;
 	uint64_t		cached_dev_sectors;
@@ -548,7 +548,7 @@ struct cache_set {
 	 * Default number of pages for a new btree node - may be less than a
 	 * full bucket
 	 */
-	unsigned		btree_pages;
+	unsigned int		btree_pages;
 
 	/*
 	 * Lists of struct btrees; lru is the list for structs that have memory
@@ -571,7 +571,7 @@ struct cache_set {
 	struct list_head	btree_cache_freed;
 
 	/* Number of elements in btree_cache + btree_cache_freeable lists */
-	unsigned		btree_cache_used;
+	unsigned int		btree_cache_used;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
@@ -649,7 +649,7 @@ struct cache_set {
 	struct mutex		verify_lock;
 #endif
 
-	unsigned		nr_uuids;
+	unsigned int		nr_uuids;
 	struct uuid_entry	*uuids;
 	BKEY_PADDED(uuid_bucket);
 	struct closure		uuid_write;
@@ -670,12 +670,12 @@ struct cache_set {
 	struct journal		journal;
 
 #define CONGESTED_MAX		1024
-	unsigned		congested_last_us;
+	unsigned int		congested_last_us;
 	atomic_t		congested;
 
 	/* The rest of this all shows up in sysfs */
-	unsigned		congested_read_threshold_us;
-	unsigned		congested_write_threshold_us;
+	unsigned int		congested_read_threshold_us;
+	unsigned int		congested_write_threshold_us;
 
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
@@ -694,16 +694,16 @@ struct cache_set {
 		ON_ERROR_PANIC,
 	}			on_error;
 #define DEFAULT_IO_ERROR_LIMIT 8
-	unsigned		error_limit;
-	unsigned		error_decay;
+	unsigned int		error_limit;
+	unsigned int		error_decay;
 
 	unsigned short		journal_delay_ms;
 	bool			expensive_debug_checks;
-	unsigned		verify:1;
-	unsigned		key_merging_disabled:1;
-	unsigned		gc_always_rewrite:1;
-	unsigned		shrinker_disabled:1;
-	unsigned		copy_gc_enabled:1;
+	unsigned int		verify:1;
+	unsigned int		key_merging_disabled:1;
+	unsigned int		gc_always_rewrite:1;
+	unsigned int		shrinker_disabled:1;
+	unsigned int		copy_gc_enabled:1;
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
@@ -712,7 +712,7 @@ struct cache_set {
 };
 
 struct bbio {
-	unsigned		submit_time_us;
+	unsigned int		submit_time_us;
 	union {
 		struct bkey	key;
 		uint64_t	_pad[3];
@@ -729,10 +729,10 @@ struct bbio {
 
 #define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)
 #define btree_blocks(b)							\
-	((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
+	((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
 
 #define btree_default_blocks(c)						\
-	((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
+	((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
 
 #define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
 #define bucket_bytes(c)		((c)->sb.bucket_size << 9)
@@ -761,21 +761,21 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
 
 static inline struct cache *PTR_CACHE(struct cache_set *c,
 				      const struct bkey *k,
-				      unsigned ptr)
+				      unsigned int ptr)
 {
 	return c->cache[PTR_DEV(k, ptr)];
 }
 
 static inline size_t PTR_BUCKET_NR(struct cache_set *c,
 				   const struct bkey *k,
-				   unsigned ptr)
+				   unsigned int ptr)
 {
 	return sector_to_bucket(c, PTR_OFFSET(k, ptr));
 }
 
 static inline struct bucket *PTR_BUCKET(struct cache_set *c,
 					const struct bkey *k,
-					unsigned ptr)
+					unsigned int ptr)
 {
 	return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
 }
@@ -787,13 +787,13 @@ static inline uint8_t gen_after(uint8_t a, uint8_t b)
 }
 
 static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
-				unsigned i)
+				unsigned int i)
 {
 	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
 }
 
 static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
-				 unsigned i)
+				 unsigned int i)
 {
 	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
 }
@@ -888,7 +888,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
 static inline void wake_up_allocators(struct cache_set *c)
 {
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 
 	for_each_cache(ca, c, i)
 		wake_up_process(ca->alloc_thread);
@@ -933,7 +933,8 @@ void bch_bbio_free(struct bio *, struct cache_set *);
 struct bio *bch_bbio_alloc(struct cache_set *);
 
 void __bch_submit_bbio(struct bio *, struct cache_set *);
-void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
+void bch_submit_bbio(struct bio *, struct cache_set *,
+		     struct bkey *, unsigned int);
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
@@ -944,13 +945,13 @@ void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
 void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
-int __bch_bucket_alloc_set(struct cache_set *, unsigned,
+long bch_bucket_alloc(struct cache *, unsigned int, bool);
+int __bch_bucket_alloc_set(struct cache_set *, unsigned int,
 			   struct bkey *, int, bool);
-int bch_bucket_alloc_set(struct cache_set *, unsigned,
+int bch_bucket_alloc_set(struct cache_set *, unsigned int,
 			 struct bkey *, int, bool);
-bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
-		       unsigned, unsigned, bool);
+bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned int,
+		       unsigned int, unsigned int, bool);
 bool bch_cached_dev_error(struct cached_dev *dc);
 
 __printf(2, 3)
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 596c93b44e9b..dfda7e9efc3e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -18,7 +18,7 @@
 
 #ifdef CONFIG_BCACHE_DEBUG
 
-void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
+void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set)
 {
 	struct bkey *k, *next;
 
@@ -26,7 +26,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
 		next = bkey_next(k);
 
 		printk(KERN_ERR "block %u key %u/%u: ", set,
-		       (unsigned) ((u64 *) k - i->d), i->keys);
+		       (unsigned int) ((u64 *) k - i->d), i->keys);
 
 		if (b->ops->key_dump)
 			b->ops->key_dump(b, k);
@@ -42,7 +42,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
 
 void bch_dump_bucket(struct btree_keys *b)
 {
-	unsigned i;
+	unsigned int i;
 
 	console_lock();
 	for (i = 0; i <= b->nsets; i++)
@@ -53,7 +53,7 @@ void bch_dump_bucket(struct btree_keys *b)
 
 int __bch_count_data(struct btree_keys *b)
 {
-	unsigned ret = 0;
+	unsigned int ret = 0;
 	struct btree_iter iter;
 	struct bkey *k;
 
@@ -128,7 +128,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
 
 /* Keylists */
 
-int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
+int __bch_keylist_realloc(struct keylist *l, unsigned int u64s)
 {
 	size_t oldsize = bch_keylist_nkeys(l);
 	size_t newsize = oldsize + u64s;
@@ -180,7 +180,7 @@ void bch_keylist_pop_front(struct keylist *l)
 /* Key/pointer manipulation */
 
 void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
-			      unsigned i)
+			      unsigned int i)
 {
 	BUG_ON(i > KEY_PTRS(src));
 
@@ -194,7 +194,7 @@ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
 
 bool __bch_cut_front(const struct bkey *where, struct bkey *k)
 {
-	unsigned i, len = 0;
+	unsigned int i, len = 0;
 
 	if (bkey_cmp(where, &START_KEY(k)) <= 0)
 		return false;
@@ -214,7 +214,7 @@ bool __bch_cut_front(const struct bkey *where, struct bkey *k)
 
 bool __bch_cut_back(const struct bkey *where, struct bkey *k)
 {
-	unsigned len = 0;
+	unsigned int len = 0;
 
 	if (bkey_cmp(where, k) >= 0)
 		return false;
@@ -240,9 +240,9 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
 #define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
 
 struct bkey_float {
-	unsigned	exponent:BKEY_EXPONENT_BITS;
-	unsigned	m:BKEY_MID_BITS;
-	unsigned	mantissa:BKEY_MANTISSA_BITS;
+	unsigned int	exponent:BKEY_EXPONENT_BITS;
+	unsigned int	m:BKEY_MID_BITS;
+	unsigned int	mantissa:BKEY_MANTISSA_BITS;
 } __packed;
 
 /*
@@ -311,7 +311,7 @@ void bch_btree_keys_free(struct btree_keys *b)
 }
 EXPORT_SYMBOL(bch_btree_keys_free);
 
-int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
+int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, gfp_t gfp)
 {
 	struct bset_tree *t = b->set;
 
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(bch_btree_keys_alloc);
 void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
 			 bool *expensive_debug_checks)
 {
-	unsigned i;
+	unsigned int i;
 
 	b->ops = ops;
 	b->expensive_debug_checks = expensive_debug_checks;
@@ -370,7 +370,7 @@ EXPORT_SYMBOL(bch_btree_keys_init);
  * return array index next to j when does in-order traverse
  * of a binary tree which is stored in a linear array
  */
-static unsigned inorder_next(unsigned j, unsigned size)
+static unsigned int inorder_next(unsigned int j, unsigned int size)
 {
 	if (j * 2 + 1 < size) {
 		j = j * 2 + 1;
@@ -387,7 +387,7 @@ static unsigned inorder_next(unsigned j, unsigned size)
  * return array index previous to j when does in-order traverse
  * of a binary tree which is stored in a linear array
  */
-static unsigned inorder_prev(unsigned j, unsigned size)
+static unsigned int inorder_prev(unsigned int j, unsigned int size)
 {
 	if (j * 2 < size) {
 		j = j * 2;
@@ -413,10 +413,12 @@ static unsigned inorder_prev(unsigned j, unsigned size)
  * extra is a function of size:
  *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
  */
-static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
+static unsigned int __to_inorder(unsigned int j,
+				  unsigned int size,
+				  unsigned int extra)
 {
-	unsigned b = fls(j);
-	unsigned shift = fls(size - 1) - b;
+	unsigned int b = fls(j);
+	unsigned int shift = fls(size - 1) - b;
 
 	j  ^= 1U << (b - 1);
 	j <<= 1;
@@ -433,14 +435,16 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
  * Return the cacheline index in bset_tree->data, where j is index
  * from a linear array which stores the auxiliar binary tree
  */
-static unsigned to_inorder(unsigned j, struct bset_tree *t)
+static unsigned int to_inorder(unsigned int j, struct bset_tree *t)
 {
 	return __to_inorder(j, t->size, t->extra);
 }
 
-static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
+static unsigned int __inorder_to_tree(unsigned int j,
+				      unsigned int size,
+				      unsigned int extra)
 {
-	unsigned shift;
+	unsigned int shift;
 
 	if (j > extra)
 		j += j - extra;
@@ -457,7 +461,7 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
  * Return an index from a linear array which stores the auxiliar binary
  * tree, j is the cacheline index of t->data.
  */
-static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
+static unsigned int inorder_to_tree(unsigned int j, struct bset_tree *t)
 {
 	return __inorder_to_tree(j, t->size, t->extra);
 }
@@ -468,11 +472,11 @@ void inorder_test(void)
 	unsigned long done = 0;
 	ktime_t start = ktime_get();
 
-	for (unsigned size = 2;
+	for (unsigned int size = 2;
 	     size < 65536000;
 	     size++) {
-		unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
-		unsigned i = 1, j = rounddown_pow_of_two(size - 1);
+		unsigned int extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+		unsigned int i = 1, j = rounddown_pow_of_two(size - 1);
 
 		if (!(size % 4096))
 			printk(KERN_NOTICE "loop %u, %llu per us\n", size,
@@ -518,30 +522,31 @@ void inorder_test(void)
  * of the previous key so we can walk backwards to it from t->tree[j]'s key.
  */
 
-static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
-				      unsigned offset)
+static struct bkey *cacheline_to_bkey(struct bset_tree *t,
+				      unsigned int cacheline,
+				      unsigned int offset)
 {
 	return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
 }
 
-static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
+static unsigned int bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
 {
 	return ((void *) k - (void *) t->data) / BSET_CACHELINE;
 }
 
-static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
-					 unsigned cacheline,
+static unsigned int bkey_to_cacheline_offset(struct bset_tree *t,
+					 unsigned int cacheline,
 					 struct bkey *k)
 {
 	return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
 }
 
-static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
+static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned int j)
 {
 	return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
 }
 
-static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
+static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned int j)
 {
 	return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
 }
@@ -550,7 +555,7 @@ static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
  * For the write set - the one we're currently inserting keys into - we don't
  * maintain a full search tree, we just keep a simple lookup table in t->prev.
  */
-static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
+static struct bkey *table_to_bkey(struct bset_tree *t, unsigned int cacheline)
 {
 	return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
 }
@@ -576,14 +581,14 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
  * See make_bfloat() to check when most significant bit of f->exponent
  * is set or not.
  */
-static inline unsigned bfloat_mantissa(const struct bkey *k,
+static inline unsigned int bfloat_mantissa(const struct bkey *k,
 				       struct bkey_float *f)
 {
 	const uint64_t *p = &k->low - (f->exponent >> 6);
 	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
 }
 
-static void make_bfloat(struct bset_tree *t, unsigned j)
+static void make_bfloat(struct bset_tree *t, unsigned int j)
 {
 	struct bkey_float *f = &t->tree[j];
 	struct bkey *m = tree_to_bkey(t, j);
@@ -631,7 +636,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
 static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
 {
 	if (t != b->set) {
-		unsigned j = roundup(t[-1].size,
+		unsigned int j = roundup(t[-1].size,
 				     64 / sizeof(struct bkey_float));
 
 		t->tree = t[-1].tree + j;
@@ -686,13 +691,13 @@ void bch_bset_build_written_tree(struct btree_keys *b)
 {
 	struct bset_tree *t = bset_tree_last(b);
 	struct bkey *prev = NULL, *k = t->data->start;
-	unsigned j, cacheline = 1;
+	unsigned int j, cacheline = 1;
 
 	b->last_set_unwritten = 0;
 
 	bset_alloc_tree(b, t);
 
-	t->size = min_t(unsigned,
+	t->size = min_t(unsigned int,
 			bkey_to_cacheline(t, bset_bkey_last(t->data)),
 			b->set->tree + btree_keys_cachelines(b) - t->tree);
 
@@ -732,7 +737,7 @@ EXPORT_SYMBOL(bch_bset_build_written_tree);
 void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
 {
 	struct bset_tree *t;
-	unsigned inorder, j = 1;
+	unsigned int inorder, j = 1;
 
 	for (t = b->set; t <= bset_tree_last(b); t++)
 		if (k < bset_bkey_last(t->data))
@@ -779,8 +784,8 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b,
 				      struct bset_tree *t,
 				      struct bkey *k)
 {
-	unsigned shift = bkey_u64s(k);
-	unsigned j = bkey_to_cacheline(t, k);
+	unsigned int shift = bkey_u64s(k);
+	unsigned int j = bkey_to_cacheline(t, k);
 
 	/* We're getting called from btree_split() or btree_gc, just bail out */
 	if (!t->size)
@@ -867,10 +872,10 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where,
 }
 EXPORT_SYMBOL(bch_bset_insert);
 
-unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
+unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
 			      struct bkey *replace_key)
 {
-	unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
+	unsigned int status = BTREE_INSERT_STATUS_NO_INSERT;
 	struct bset *i = bset_tree_last(b)->data;
 	struct bkey *m, *prev = NULL;
 	struct btree_iter iter;
@@ -922,10 +927,10 @@ struct bset_search_iter {
 static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
 						     const struct bkey *search)
 {
-	unsigned li = 0, ri = t->size;
+	unsigned int li = 0, ri = t->size;
 
 	while (li + 1 != ri) {
-		unsigned m = (li + ri) >> 1;
+		unsigned int m = (li + ri) >> 1;
 
 		if (bkey_cmp(table_to_bkey(t, m), search) > 0)
 			ri = m;
@@ -944,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 {
 	struct bkey *l, *r;
 	struct bkey_float *f;
-	unsigned inorder, j, n = 1;
+	unsigned int inorder, j, n = 1;
 
 	do {
 		/*
@@ -958,7 +963,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 		 *		p = 0;
 		 * but a branch instruction is avoided.
 		 */
-		unsigned p = n << 4;
+		unsigned int p = n << 4;
 		p &= ((int) (p - t->size)) >> 31;
 
 		prefetch(&t->tree[p]);
@@ -978,7 +983,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 		 * to work  - that's done in make_bfloat()
 		 */
 		if (likely(f->exponent != 127))
-			n = j * 2 + (((unsigned)
+			n = j * 2 + (((unsigned int)
 				      (f->mantissa -
 				       bfloat_mantissa(search, f))) >> 31);
 		else
@@ -1184,7 +1189,8 @@ void bch_bset_sort_state_free(struct bset_sort_state *state)
 	mempool_exit(&state->pool);
 }
 
-int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
+int bch_bset_sort_state_init(struct bset_sort_state *state,
+			     unsigned int page_order)
 {
 	spin_lock_init(&state->time.lock);
 
@@ -1237,7 +1243,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out,
 }
 
 static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
-			 unsigned start, unsigned order, bool fixup,
+			 unsigned int start, unsigned int order, bool fixup,
 			 struct bset_sort_state *state)
 {
 	uint64_t start_time;
@@ -1288,7 +1294,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 		bch_time_stats_update(&state->time, start_time);
 }
 
-void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
+void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
 			    struct bset_sort_state *state)
 {
 	size_t order = b->page_order, keys = 0;
@@ -1298,7 +1304,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
 	__bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
 
 	if (start) {
-		unsigned i;
+		unsigned int i;
 
 		for (i = start; i <= b->nsets; i++)
 			keys += b->set[i].data->keys;
@@ -1338,7 +1344,7 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
 
 void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
 {
-	unsigned crit = SORT_CRIT;
+	unsigned int crit = SORT_CRIT;
 	int i;
 
 	/* Don't sort if nothing to do */
@@ -1367,7 +1373,7 @@ EXPORT_SYMBOL(bch_btree_sort_lazy);
 
 void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i <= b->nsets; i++) {
 		struct bset_tree *t = &b->set[i];
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index b867f2200495..fdc296103113 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -163,10 +163,10 @@ struct bset_tree {
 	 */
 
 	/* size of the binary tree and prev array */
-	unsigned		size;
+	unsigned int		size;
 
 	/* function of size - precalculated for to_inorder() */
-	unsigned		extra;
+	unsigned int		extra;
 
 	/* copy of the last key in the set */
 	struct bkey		end;
@@ -211,7 +211,7 @@ struct btree_keys {
 	const struct btree_keys_ops	*ops;
 	uint8_t			page_order;
 	uint8_t			nsets;
-	unsigned		last_set_unwritten:1;
+	unsigned int		last_set_unwritten:1;
 	bool			*expensive_debug_checks;
 
 	/*
@@ -239,12 +239,12 @@ static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
 	return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
 }
 
-static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
+static inline unsigned int bset_byte_offset(struct btree_keys *b, struct bset *i)
 {
 	return ((size_t) i) - ((size_t) b->set->data);
 }
 
-static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
+static inline unsigned int bset_sector_offset(struct btree_keys *b, struct bset *i)
 {
 	return bset_byte_offset(b, i) >> 9;
 }
@@ -273,7 +273,7 @@ static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
 }
 
 static inline struct bset *bset_next_set(struct btree_keys *b,
-					 unsigned block_bytes)
+					 unsigned int block_bytes)
 {
 	struct bset *i = bset_tree_last(b)->data;
 
@@ -281,7 +281,7 @@ static inline struct bset *bset_next_set(struct btree_keys *b,
 }
 
 void bch_btree_keys_free(struct btree_keys *);
-int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
+int bch_btree_keys_alloc(struct btree_keys *, unsigned int, gfp_t);
 void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
 			 bool *);
 
@@ -290,7 +290,7 @@ void bch_bset_build_written_tree(struct btree_keys *);
 void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
 bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
 void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
-unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
+unsigned int bch_btree_insert_key(struct btree_keys *, struct bkey *,
 			      struct bkey *);
 
 enum {
@@ -349,20 +349,20 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
 struct bset_sort_state {
 	mempool_t		pool;
 
-	unsigned		page_order;
-	unsigned		crit_factor;
+	unsigned int		page_order;
+	unsigned int		crit_factor;
 
 	struct time_stats	time;
 };
 
 void bch_bset_sort_state_free(struct bset_sort_state *);
-int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
+int bch_bset_sort_state_init(struct bset_sort_state *, unsigned int);
 void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
 void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
 			 struct bset_sort_state *);
 void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
 				    struct bset_sort_state *);
-void bch_btree_sort_partial(struct btree_keys *, unsigned,
+void bch_btree_sort_partial(struct btree_keys *, unsigned int,
 			    struct bset_sort_state *);
 
 static inline void bch_btree_sort(struct btree_keys *b,
@@ -383,7 +383,7 @@ void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
 
 #define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
 
-static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
 {
 	return bkey_idx(i->start, idx);
 }
@@ -402,7 +402,7 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
 }
 
 void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
-			      unsigned);
+			      unsigned int);
 bool __bch_cut_front(const struct bkey *, struct bkey *);
 bool __bch_cut_back(const struct bkey *, struct bkey *);
 
@@ -524,7 +524,7 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
 
 struct bkey *bch_keylist_pop(struct keylist *);
 void bch_keylist_pop_front(struct keylist *);
-int __bch_keylist_realloc(struct keylist *, unsigned);
+int __bch_keylist_realloc(struct keylist *, unsigned int);
 
 /* Debug stuff */
 
@@ -532,7 +532,7 @@ int __bch_keylist_realloc(struct keylist *, unsigned);
 
 int __bch_count_data(struct btree_keys *);
 void __printf(2, 3) __bch_check_keys(struct btree_keys *, const char *, ...);
-void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned int);
 void bch_dump_bucket(struct btree_keys *);
 
 #else
@@ -541,7 +541,7 @@ static inline int __bch_count_data(struct btree_keys *b) { return -1; }
 static inline void __printf(2, 3)
 	__bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
 static inline void bch_dump_bucket(struct btree_keys *b) {}
-void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned int);
 
 #endif
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index c19f7716df88..96c39a8db895 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -183,7 +183,7 @@ static void bch_btree_init_next(struct btree *b)
 
 void bkey_put(struct cache_set *c, struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i))
@@ -479,7 +479,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
 
 void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
-	unsigned nsets = b->keys.nsets;
+	unsigned int nsets = b->keys.nsets;
 
 	lockdep_assert_held(&b->lock);
 
@@ -581,7 +581,7 @@ static void mca_bucket_free(struct btree *b)
 	list_move(&b->list, &b->c->btree_cache_freeable);
 }
 
-static unsigned btree_order(struct bkey *k)
+static unsigned int btree_order(struct bkey *k)
 {
 	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
 }
@@ -589,7 +589,7 @@ static unsigned btree_order(struct bkey *k)
 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 {
 	if (!bch_btree_keys_alloc(&b->keys,
-				  max_t(unsigned,
+				  max_t(unsigned int,
 					ilog2(b->c->btree_pages),
 					btree_order(k)),
 				  gfp)) {
@@ -620,7 +620,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 	return b;
 }
 
-static int mca_reap(struct btree *b, unsigned min_order, bool flush)
+static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
 {
 	struct closure cl;
 
@@ -786,7 +786,7 @@ void bch_btree_cache_free(struct cache_set *c)
 
 int bch_btree_cache_alloc(struct cache_set *c)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < mca_reserve(c); i++)
 		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
@@ -1136,7 +1136,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b,
 
 static void make_btree_freeing_key(struct btree *b, struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 
 	mutex_lock(&b->c->bucket_lock);
 
@@ -1157,7 +1157,7 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
 	struct cache *ca;
-	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
+	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
 
 	mutex_lock(&c->bucket_lock);
 
@@ -1181,7 +1181,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
 				    struct bkey *k)
 {
 	uint8_t stale = 0;
-	unsigned i;
+	unsigned int i;
 	struct bucket *g;
 
 	/*
@@ -1219,7 +1219,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
 			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
 
 		/* guard against overflow */
-		SET_GC_SECTORS_USED(g, min_t(unsigned,
+		SET_GC_SECTORS_USED(g, min_t(unsigned int,
 					     GC_SECTORS_USED(g) + KEY_SIZE(k),
 					     MAX_GC_SECTORS_USED));
 
@@ -1233,7 +1233,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
 
 void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i) &&
@@ -1259,7 +1259,7 @@ void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 {
 	uint8_t stale = 0;
-	unsigned keys = 0, good_keys = 0;
+	unsigned int keys = 0, good_keys = 0;
 	struct bkey *k;
 	struct btree_iter iter;
 	struct bset_tree *t;
@@ -1302,7 +1302,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 
 struct gc_merge_info {
 	struct btree	*b;
-	unsigned	keys;
+	unsigned int	keys;
 };
 
 static int bch_btree_insert_node(struct btree *, struct btree_op *,
@@ -1311,7 +1311,7 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			     struct gc_stat *gc, struct gc_merge_info *r)
 {
-	unsigned i, nodes = 0, keys = 0, blocks;
+	unsigned int i, nodes = 0, keys = 0, blocks;
 	struct btree *new_nodes[GC_MERGE_NODES];
 	struct keylist keylist;
 	struct closure cl;
@@ -1511,11 +1511,11 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
 	return -EINTR;
 }
 
-static unsigned btree_gc_count_keys(struct btree *b)
+static unsigned int btree_gc_count_keys(struct btree *b)
 {
 	struct bkey *k;
 	struct btree_iter iter;
-	unsigned ret = 0;
+	unsigned int ret = 0;
 
 	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
 		ret += bkey_u64s(k);
@@ -1678,7 +1678,7 @@ static void btree_gc_start(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned i;
+	unsigned int i;
 
 	if (!c->gc_mark_valid)
 		return;
@@ -1704,7 +1704,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
 {
 	struct bucket *b;
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 
 	mutex_lock(&c->bucket_lock);
 
@@ -1722,7 +1722,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
 		struct bcache_device *d = c->devices[i];
 		struct cached_dev *dc;
 		struct keybuf_key *w, *n;
-		unsigned j;
+		unsigned int j;
 
 		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
 			continue;
@@ -1814,7 +1814,7 @@ static void bch_btree_gc(struct cache_set *c)
 static bool gc_should_run(struct cache_set *c)
 {
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 
 	for_each_cache(ca, c, i)
 		if (ca->invalidate_needs_gc)
@@ -1905,7 +1905,7 @@ void bch_initial_gc_finish(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned i;
+	unsigned int i;
 
 	bch_btree_gc_finish(c);
 
@@ -1945,7 +1945,7 @@ void bch_initial_gc_finish(struct cache_set *c)
 static bool btree_insert_key(struct btree *b, struct bkey *k,
 			     struct bkey *replace_key)
 {
-	unsigned status;
+	unsigned int status;
 
 	BUG_ON(bkey_cmp(k, &b->key) > 0);
 
@@ -2044,7 +2044,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
 
 	if (split) {
-		unsigned keys = 0;
+		unsigned int keys = 0;
 
 		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
@@ -2300,7 +2300,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys,
 
 void bch_btree_set_root(struct btree *b)
 {
-	unsigned i;
+	unsigned int i;
 	struct closure cl;
 
 	closure_init_stack(&cl);
@@ -2412,7 +2412,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 
 struct refill {
 	struct btree_op	op;
-	unsigned	nr_found;
+	unsigned int	nr_found;
 	struct keybuf	*buf;
 	struct bkey	*end;
 	keybuf_pred_fn	*pred;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 68e9d926134d..8cbc0fd27738 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -184,7 +184,7 @@ static inline struct bset *btree_bset_last(struct btree *b)
 	return bset_tree_last(&b->keys)->data;
 }
 
-static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
+static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
 {
 	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
 }
@@ -213,7 +213,7 @@ struct btree_op {
 	/* Btree level at which we start taking write locks */
 	short			lock;
 
-	unsigned		insert_collision:1;
+	unsigned int		insert_collision:1;
 };
 
 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 7c2c5bc7c88b..7f3594c0be14 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -159,7 +159,7 @@ struct closure {
 #define CLOSURE_MAGIC_DEAD	0xc054dead
 #define CLOSURE_MAGIC_ALIVE	0xc054a11e
 
-	unsigned		magic;
+	unsigned int		magic;
 	struct list_head	all;
 	unsigned long		ip;
 	unsigned long		waiting_on;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 12034c07257b..0caad145902b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -69,7 +69,7 @@ void bch_btree_verify(struct btree *b)
 		   sorted->start,
 		   (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
 		struct bset *i;
-		unsigned j;
+		unsigned int j;
 
 		console_lock();
 
@@ -80,7 +80,7 @@ void bch_btree_verify(struct btree *b)
 		bch_dump_bset(&v->keys, sorted, 0);
 
 		for_each_written_bset(b, ondisk, i) {
-			unsigned block = ((void *) i - (void *) ondisk) /
+			unsigned int block = ((void *) i - (void *) ondisk) /
 				block_bytes(b->c);
 
 			printk(KERN_ERR "*** on disk block %u:\n", block);
@@ -176,7 +176,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
 
 	while (size) {
 		struct keybuf_key *w;
-		unsigned bytes = min(i->bytes, size);
+		unsigned int bytes = min(i->bytes, size);
 
 		int err = copy_to_user(buf, i->buf, bytes);
 		if (err)
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 1d096742eb41..e96ba928eeb6 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -46,7 +46,7 @@ static bool bch_key_sort_cmp(struct btree_iter_set l,
 
 static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i)) {
@@ -67,7 +67,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
 
 static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i)) {
@@ -96,7 +96,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
 
 void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
 {
-	unsigned i = 0;
+	unsigned int i = 0;
 	char *out = buf, *end = buf + size;
 
 #define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))
@@ -126,7 +126,7 @@ void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
 static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
 {
 	struct btree *b = container_of(keys, struct btree, keys);
-	unsigned j;
+	unsigned int j;
 	char buf[80];
 
 	bch_extent_to_text(buf, sizeof(buf), k);
@@ -171,7 +171,7 @@ static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
 
 static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
 {
-	unsigned i;
+	unsigned int i;
 	char buf[80];
 	struct bucket *g;
 
@@ -204,7 +204,7 @@ err:
 static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
 {
 	struct btree *b = container_of(bk, struct btree, keys);
-	unsigned i;
+	unsigned int i;
 
 	if (!bkey_cmp(k, &ZERO_KEY) ||
 	    !KEY_PTRS(k) ||
@@ -327,7 +327,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 	struct cache_set *c = container_of(b, struct btree, keys)->c;
 
 	uint64_t old_offset;
-	unsigned old_size, sectors_found = 0;
+	unsigned int old_size, sectors_found = 0;
 
 	BUG_ON(!KEY_OFFSET(insert));
 	BUG_ON(!KEY_SIZE(insert));
@@ -363,7 +363,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 			 * k might have been split since we inserted/found the
 			 * key we're replacing
 			 */
-			unsigned i;
+			unsigned int i;
 			uint64_t offset = KEY_START(k) -
 				KEY_START(replace_key);
 
@@ -502,7 +502,7 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
 }
 
 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
-				     unsigned ptr)
+				     unsigned int ptr)
 {
 	struct bucket *g = PTR_BUCKET(b->c, k, ptr);
 	char buf[80];
@@ -534,7 +534,7 @@ err:
 static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
 {
 	struct btree *b = container_of(bk, struct btree, keys);
-	unsigned i, stale;
+	unsigned int i, stale;
 
 	if (!KEY_PTRS(k) ||
 	    bch_extent_invalid(bk, k))
@@ -577,7 +577,7 @@ static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
 static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
 {
 	struct btree *b = container_of(bk, struct btree, keys);
-	unsigned i;
+	unsigned int i;
 
 	if (key_merging_disabled(b->c))
 		return false;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 9612873afee2..c6b41a09f550 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -42,7 +42,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
 }
 
 void bch_submit_bbio(struct bio *bio, struct cache_set *c,
-		     struct bkey *k, unsigned ptr)
+		     struct bkey *k, unsigned int ptr)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	bch_bkey_copy_single_ptr(&b->key, k, ptr);
@@ -52,7 +52,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 /* IO errors */
 void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
 {
-	unsigned errors;
+	unsigned int errors;
 
 	WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
 
@@ -75,12 +75,12 @@ void bch_count_io_errors(struct cache *ca,
 	 */
 
 	if (ca->set->error_decay) {
-		unsigned count = atomic_inc_return(&ca->io_count);
+		unsigned int count = atomic_inc_return(&ca->io_count);
 
 		while (count > ca->set->error_decay) {
-			unsigned errors;
-			unsigned old = count;
-			unsigned new = count - ca->set->error_decay;
+			unsigned int errors;
+			unsigned int old = count;
+			unsigned int new = count - ca->set->error_decay;
 
 			/*
 			 * First we subtract refresh from count; each time we
@@ -104,7 +104,7 @@ void bch_count_io_errors(struct cache *ca,
 	}
 
 	if (error) {
-		unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
+		unsigned int errors = atomic_add_return(1 << IO_ERROR_SHIFT,
 						    &ca->io_errors);
 		errors >>= IO_ERROR_SHIFT;
 
@@ -126,12 +126,12 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
 	struct cache *ca = PTR_CACHE(c, &b->key, 0);
 	int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
 
-	unsigned threshold = op_is_write(bio_op(bio))
+	unsigned int threshold = op_is_write(bio_op(bio))
 		? c->congested_write_threshold_us
 		: c->congested_read_threshold_us;
 
 	if (threshold) {
-		unsigned t = local_clock_us();
+		unsigned int t = local_clock_us();
 
 		int us = t - b->submit_time_us;
 		int congested = atomic_read(&c->congested);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 10748c626a1d..ee61062b58fc 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -32,7 +32,7 @@ static void journal_read_endio(struct bio *bio)
 }
 
 static int journal_read_bucket(struct cache *ca, struct list_head *list,
-			       unsigned bucket_index)
+			       unsigned int bucket_index)
 {
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = &ja->bio;
@@ -40,7 +40,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
 	struct journal_replay *i;
 	struct jset *j, *data = ca->set->journal.w[0].data;
 	struct closure cl;
-	unsigned len, left, offset = 0;
+	unsigned int len, left, offset = 0;
 	int ret = 0;
 	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
 
@@ -50,7 +50,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
 
 	while (offset < ca->sb.bucket_size) {
 reread:		left = ca->sb.bucket_size - offset;
-		len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
+		len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS);
 
 		bio_reset(bio);
 		bio->bi_iter.bi_sector	= bucket + offset;
@@ -154,12 +154,12 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 	})
 
 	struct cache *ca;
-	unsigned iter;
+	unsigned int iter;
 
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
 		DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
-		unsigned i, l, r, m;
+		unsigned int i, l, r, m;
 		uint64_t seq;
 
 		bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
@@ -304,7 +304,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 		     k < bset_bkey_last(&i->j);
 		     k = bkey_next(k))
 			if (!__bch_extent_invalid(c, k)) {
-				unsigned j;
+				unsigned int j;
 
 				for (j = 0; j < KEY_PTRS(k); j++)
 					if (ptr_available(c, k, j))
@@ -492,7 +492,7 @@ static void journal_reclaim(struct cache_set *c)
 	struct bkey *k = &c->journal.key;
 	struct cache *ca;
 	uint64_t last_seq;
-	unsigned iter, n = 0;
+	unsigned int iter, n = 0;
 	atomic_t p __maybe_unused;
 
 	atomic_long_inc(&c->reclaim);
@@ -526,7 +526,7 @@ static void journal_reclaim(struct cache_set *c)
 
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
-		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+		unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
 
 		/* No space available on this device */
 		if (next == ja->discard_idx)
@@ -609,7 +609,7 @@ static void journal_write_unlocked(struct closure *cl)
 	struct cache *ca;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
-	unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+	unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) *
 		c->sb.block_size;
 
 	struct bio *bio;
@@ -705,7 +705,7 @@ static void journal_try_write(struct cache_set *c)
 }
 
 static struct journal_write *journal_wait_for_write(struct cache_set *c,
-						    unsigned nkeys)
+						    unsigned int nkeys)
 	__acquires(&c->journal.lock)
 {
 	size_t sectors;
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index b5788199188f..f0982731ae20 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -110,7 +110,7 @@ struct journal {
 	struct delayed_work	work;
 
 	/* Number of blocks free in the bucket(s) we're currently writing to */
-	unsigned		blocks_free;
+	unsigned int		blocks_free;
 	uint64_t		seq;
 	DECLARE_FIFO(atomic_t, pin);
 
@@ -131,13 +131,13 @@ struct journal_device {
 	uint64_t		seq[SB_JOURNAL_BUCKETS];
 
 	/* Journal bucket we're currently writing to */
-	unsigned		cur_idx;
+	unsigned int		cur_idx;
 
 	/* Last journal bucket that still contains an open journal entry */
-	unsigned		last_idx;
+	unsigned int		last_idx;
 
 	/* Next journal bucket to be discarded */
-	unsigned		discard_idx;
+	unsigned int		discard_idx;
 
 #define DISCARD_READY		0
 #define DISCARD_IN_FLIGHT	1
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index a24c3a95b2c0..0790d710f911 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -23,7 +23,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 {
 	struct cache_set *c = container_of(buf, struct cache_set,
 					   moving_gc_keys);
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i) &&
@@ -186,7 +186,7 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
 	return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
 }
 
-static unsigned bucket_heap_top(struct cache *ca)
+static unsigned int bucket_heap_top(struct cache *ca)
 {
 	struct bucket *b;
 	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
@@ -196,7 +196,7 @@ void bch_moving_gc(struct cache_set *c)
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned i;
+	unsigned int i;
 
 	if (!c->copy_gc_enabled)
 		return;
@@ -204,9 +204,9 @@ void bch_moving_gc(struct cache_set *c)
 	mutex_lock(&c->bucket_lock);
 
 	for_each_cache(ca, c, i) {
-		unsigned sectors_to_move = 0;
-		unsigned reserve_sectors = ca->sb.bucket_size *
-			fifo_used(&ca->free[RESERVE_MOVINGGC]);
+		unsigned int sectors_to_move = 0;
+		unsigned int reserve_sectors = ca->sb.bucket_size *
+			     fifo_used(&ca->free[RESERVE_MOVINGGC]);
 
 		ca->heap.used = 0;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 7dbe8b6316a0..6e1a60dd1742 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -27,7 +27,7 @@ struct kmem_cache *bch_search_cache;
 
 static void bch_data_insert_start(struct closure *);
 
-static unsigned cache_mode(struct cached_dev *dc)
+static unsigned int cache_mode(struct cached_dev *dc)
 {
 	return BDEV_CACHE_MODE(&dc->sb);
 }
@@ -98,7 +98,7 @@ static void bch_data_insert_keys(struct closure *cl)
 	closure_return(cl);
 }
 
-static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
+static int bch_keylist_realloc(struct keylist *l, unsigned int u64s,
 			       struct cache_set *c)
 {
 	size_t oldsize = bch_keylist_nkeys(l);
@@ -125,7 +125,7 @@ static void bch_data_invalidate(struct closure *cl)
 		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
 
 	while (bio_sectors(bio)) {
-		unsigned sectors = min(bio_sectors(bio),
+		unsigned int sectors = min(bio_sectors(bio),
 				       1U << (KEY_SIZE_BITS - 1));
 
 		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
@@ -211,7 +211,7 @@ static void bch_data_insert_start(struct closure *cl)
 	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
 
 	do {
-		unsigned i;
+		unsigned int i;
 		struct bkey *k;
 		struct bio_set *split = &op->c->bio_split;
 
@@ -328,7 +328,7 @@ void bch_data_insert(struct closure *cl)
 
 /* Congested? */
 
-unsigned bch_get_congested(struct cache_set *c)
+unsigned int bch_get_congested(struct cache_set *c)
 {
 	int i;
 	long rand;
@@ -372,8 +372,8 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
 static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 {
 	struct cache_set *c = dc->disk.c;
-	unsigned mode = cache_mode(dc);
-	unsigned sectors, congested = bch_get_congested(c);
+	unsigned int mode = cache_mode(dc);
+	unsigned int sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
 	struct io *i;
 
@@ -469,11 +469,11 @@ struct search {
 	struct bio		*cache_miss;
 	struct bcache_device	*d;
 
-	unsigned		insert_bio_sectors;
-	unsigned		recoverable:1;
-	unsigned		write:1;
-	unsigned		read_dirty_data:1;
-	unsigned		cache_missed:1;
+	unsigned int		insert_bio_sectors;
+	unsigned int		recoverable:1;
+	unsigned int		write:1;
+	unsigned int		read_dirty_data:1;
+	unsigned int		cache_missed:1;
 
 	unsigned long		start_time;
 
@@ -514,15 +514,15 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 	struct search *s = container_of(op, struct search, op);
 	struct bio *n, *bio = &s->bio.bio;
 	struct bkey *bio_key;
-	unsigned ptr;
+	unsigned int ptr;
 
 	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
 		return MAP_CONTINUE;
 
 	if (KEY_INODE(k) != s->iop.inode ||
 	    KEY_START(k) > bio->bi_iter.bi_sector) {
-		unsigned bio_sectors = bio_sectors(bio);
-		unsigned sectors = KEY_INODE(k) == s->iop.inode
+		unsigned int bio_sectors = bio_sectors(bio);
+		unsigned int sectors = KEY_INODE(k) == s->iop.inode
 			? min_t(uint64_t, INT_MAX,
 				KEY_START(k) - bio->bi_iter.bi_sector)
 			: INT_MAX;
@@ -856,10 +856,10 @@ static void cached_dev_read_done_bh(struct closure *cl)
 }
 
 static int cached_dev_cache_miss(struct btree *b, struct search *s,
-				 struct bio *bio, unsigned sectors)
+				 struct bio *bio, unsigned int sectors)
 {
 	int ret = MAP_CONTINUE;
-	unsigned reada = 0;
+	unsigned int reada = 0;
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss, *cache_bio;
 
@@ -1226,7 +1226,7 @@ static int cached_dev_congested(void *data, int bits)
 		return 1;
 
 	if (cached_dev_get(dc)) {
-		unsigned i;
+		unsigned int i;
 		struct cache *ca;
 
 		for_each_cache(ca, d->c, i) {
@@ -1253,9 +1253,9 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
 /* Flash backed devices */
 
 static int flash_dev_cache_miss(struct btree *b, struct search *s,
-				struct bio *bio, unsigned sectors)
+				struct bio *bio, unsigned int sectors)
 {
-	unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
+	unsigned int bytes = min(sectors, bio_sectors(bio)) << 9;
 
 	swap(bio->bi_iter.bi_size, bytes);
 	zero_fill_bio(bio);
@@ -1338,7 +1338,7 @@ static int flash_dev_congested(void *data, int bits)
 	struct bcache_device *d = data;
 	struct request_queue *q;
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 	int ret = 0;
 
 	for_each_cache(ca, d->c, i) {
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index dea0886b81c1..8e8c1ce00d9d 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -8,7 +8,7 @@ struct data_insert_op {
 	struct bio		*bio;
 	struct workqueue_struct *wq;
 
-	unsigned		inode;
+	unsigned int		inode;
 	uint16_t		write_point;
 	uint16_t		write_prio;
 	blk_status_t		status;
@@ -17,15 +17,15 @@ struct data_insert_op {
 		uint16_t	flags;
 
 	struct {
-		unsigned	bypass:1;
-		unsigned	writeback:1;
-		unsigned	flush_journal:1;
-		unsigned	csum:1;
+		unsigned int	bypass:1;
+		unsigned int	writeback:1;
+		unsigned int	flush_journal:1;
+		unsigned int	csum:1;
 
-		unsigned	replace:1;
-		unsigned	replace_collision:1;
+		unsigned int	replace:1;
+		unsigned int	replace_collision:1;
 
-		unsigned	insert_data_done:1;
+		unsigned int	insert_data_done:1;
 	};
 	};
 
@@ -33,7 +33,7 @@ struct data_insert_op {
 	BKEY_PADDED(replace_key);
 };
 
-unsigned bch_get_congested(struct cache_set *);
+unsigned int bch_get_congested(struct cache_set *);
 void bch_data_insert(struct closure *cl);
 
 void bch_cached_dev_request_init(struct cached_dev *dc);
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index be119326297b..2331a0d5aa28 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -33,11 +33,11 @@
  * stored left shifted by 16, and scaled back in the sysfs show() function.
  */
 
-static const unsigned DAY_RESCALE		= 288;
-static const unsigned HOUR_RESCALE		= 12;
-static const unsigned FIVE_MINUTE_RESCALE	= 1;
-static const unsigned accounting_delay		= (HZ * 300) / 22;
-static const unsigned accounting_weight		= 32;
+static const unsigned int DAY_RESCALE		= 288;
+static const unsigned int HOUR_RESCALE		= 12;
+static const unsigned int FIVE_MINUTE_RESCALE	= 1;
+static const unsigned int accounting_delay	= (HZ * 300) / 22;
+static const unsigned int accounting_weight	= 32;
 
 /* sysfs reading/writing */
 
@@ -152,7 +152,7 @@ static void scale_accounting(struct timer_list *t)
 	struct cache_accounting *acc = from_timer(acc, t, timer);
 
 #define move_stat(name) do {						\
-	unsigned t = atomic_xchg(&acc->collector.name, 0);		\
+	unsigned int t = atomic_xchg(&acc->collector.name, 0);		\
 	t <<= 16;							\
 	acc->five_minute.name += t;					\
 	acc->hour.name += t;						\
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index 0b70f9de0c03..77234a89dd69 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -23,7 +23,7 @@ struct cache_stats {
 	unsigned long cache_miss_collisions;
 	unsigned long sectors_bypassed;
 
-	unsigned		rescale;
+	unsigned int		rescale;
 };
 
 struct cache_accounting {
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 55a37641aa95..4ab1b1968d9a 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -61,7 +61,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 	const char *err;
 	struct cache_sb *s;
 	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
-	unsigned i;
+	unsigned int i;
 
 	if (!bh)
 		return "IO error";
@@ -202,7 +202,7 @@ static void write_bdev_super_endio(struct bio *bio)
 static void __write_super(struct cache_sb *sb, struct bio *bio)
 {
 	struct cache_sb *out = page_address(bio_first_page_all(bio));
-	unsigned i;
+	unsigned int i;
 
 	bio->bi_iter.bi_sector	= SB_SECTOR;
 	bio->bi_iter.bi_size	= SB_SIZE;
@@ -282,7 +282,7 @@ void bcache_write_super(struct cache_set *c)
 {
 	struct closure *cl = &c->sb_write;
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 
 	down(&c->sb_write_mutex);
 	closure_init(cl, &c->cl);
@@ -334,7 +334,7 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
 {
 	struct closure *cl = &c->uuid_write;
 	struct uuid_entry *u;
-	unsigned i;
+	unsigned int i;
 	char buf[80];
 
 	BUG_ON(!parent);
@@ -587,7 +587,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 	struct prio_set *p = ca->disk_buckets;
 	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
 	struct bucket *b;
-	unsigned bucket_nr = 0;
+	unsigned int bucket_nr = 0;
 
 	for (b = ca->buckets;
 	     b < ca->buckets + ca->sb.nbuckets;
@@ -662,7 +662,7 @@ static void bcache_device_unlink(struct bcache_device *d)
 	lockdep_assert_held(&bch_register_lock);
 
 	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
-		unsigned i;
+		unsigned int i;
 		struct cache *ca;
 
 		sysfs_remove_link(&d->c->kobj, d->name);
@@ -676,7 +676,7 @@ static void bcache_device_unlink(struct bcache_device *d)
 static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 			       const char *name)
 {
-	unsigned i;
+	unsigned int i;
 	struct cache *ca;
 
 	for_each_cache(ca, d->c, i)
@@ -715,7 +715,7 @@ static void bcache_device_detach(struct bcache_device *d)
 }
 
 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
-				 unsigned id)
+				 unsigned int id)
 {
 	d->id = id;
 	d->c = c;
@@ -762,7 +762,7 @@ static void bcache_device_free(struct bcache_device *d)
 	closure_debug_destroy(&d->cl);
 }
 
-static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
 			      sector_t sectors)
 {
 	struct request_queue *q;
@@ -778,7 +778,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 
 	if (!d->nr_stripes || d->nr_stripes > max_stripes) {
 		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
-			(unsigned)d->nr_stripes);
+			(unsigned int)d->nr_stripes);
 		return -ENOMEM;
 	}
 
@@ -1212,7 +1212,7 @@ static void cached_dev_flush(struct closure *cl)
 	continue_at(cl, cached_dev_free, system_wq);
 }
 
-static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 {
 	int ret;
 	struct io *io;
@@ -1489,7 +1489,7 @@ static void cache_set_free(struct closure *cl)
 {
 	struct cache_set *c = container_of(cl, struct cache_set, cl);
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 
 	if (!IS_ERR_OR_NULL(c->debug))
 		debugfs_remove(c->debug);
@@ -1532,7 +1532,7 @@ static void cache_set_flush(struct closure *cl)
 	struct cache_set *c = container_of(cl, struct cache_set, caching);
 	struct cache *ca;
 	struct btree *b;
-	unsigned i;
+	unsigned int i;
 
 	bch_cache_accounting_destroy(&c->accounting);
 
@@ -1762,7 +1762,7 @@ static void run_cache_set(struct cache_set *c)
 	struct cached_dev *dc, *t;
 	struct cache *ca;
 	struct closure cl;
-	unsigned i;
+	unsigned int i;
 
 	closure_init_stack(&cl);
 
@@ -1853,7 +1853,7 @@ static void run_cache_set(struct cache_set *c)
 		pr_notice("invalidating existing data");
 
 		for_each_cache(ca, c, i) {
-			unsigned j;
+			unsigned int j;
 
 			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
 					      2, SB_JOURNAL_BUCKETS);
@@ -1998,7 +1998,7 @@ err:
 void bch_cache_release(struct kobject *kobj)
 {
 	struct cache *ca = container_of(kobj, struct cache, kobj);
-	unsigned i;
+	unsigned int i;
 
 	if (ca->set) {
 		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
@@ -2150,7 +2150,7 @@ static bool bch_is_open_backing(struct block_device *bdev) {
 static bool bch_is_open_cache(struct block_device *bdev) {
 	struct cache_set *c, *tc;
 	struct cache *ca;
-	unsigned i;
+	unsigned int i;
 
 	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
 		for_each_cache(ca, c, i)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 81d3520b0702..3f2b7964d6a9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -307,7 +307,7 @@ STORE(__cached_dev)
 		if (v < 0)
 			return v;
 
-		if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
+		if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) {
 			SET_BDEV_CACHE_MODE(&dc->sb, v);
 			bch_write_bdev_super(dc, NULL);
 		}
@@ -533,9 +533,9 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
 			op.stats.floats, op.stats.failed);
 }
 
-static unsigned bch_root_usage(struct cache_set *c)
+static unsigned int bch_root_usage(struct cache_set *c)
 {
-	unsigned bytes = 0;
+	unsigned int bytes = 0;
 	struct bkey *k;
 	struct btree *b;
 	struct btree_iter iter;
@@ -570,9 +570,9 @@ static size_t bch_cache_size(struct cache_set *c)
 	return ret;
 }
 
-static unsigned bch_cache_max_chain(struct cache_set *c)
+static unsigned int bch_cache_max_chain(struct cache_set *c)
 {
-	unsigned ret = 0;
+	unsigned int ret = 0;
 	struct hlist_head *h;
 
 	mutex_lock(&c->bucket_lock);
@@ -580,7 +580,7 @@ static unsigned bch_cache_max_chain(struct cache_set *c)
 	for (h = c->bucket_hash;
 	     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
 	     h++) {
-		unsigned i = 0;
+		unsigned int i = 0;
 		struct hlist_node *p;
 
 		hlist_for_each(p, h)
@@ -593,13 +593,13 @@ static unsigned bch_cache_max_chain(struct cache_set *c)
 	return ret;
 }
 
-static unsigned bch_btree_used(struct cache_set *c)
+static unsigned int bch_btree_used(struct cache_set *c)
 {
 	return div64_u64(c->gc_stats.key_bytes * 100,
 			 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
 }
 
-static unsigned bch_average_key_size(struct cache_set *c)
+static unsigned int bch_average_key_size(struct cache_set *c)
 {
 	return c->gc_stats.nkeys
 		? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
@@ -996,7 +996,7 @@ STORE(__bch_cache)
 		if (v < 0)
 			return v;
 
-		if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
+		if ((unsigned int) v != CACHE_REPLACEMENT(&ca->sb)) {
 			mutex_lock(&ca->set->bucket_lock);
 			SET_CACHE_REPLACEMENT(&ca->sb, v);
 			mutex_unlock(&ca->set->bucket_lock);
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f7b0133c9d2f..484044231f21 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -347,7 +347,7 @@ static inline int bch_strtoul_h(const char *cp, long *res)
 	snprintf(buf, size,						\
 		__builtin_types_compatible_p(typeof(var), int)		\
 		     ? "%i\n" :						\
-		__builtin_types_compatible_p(typeof(var), unsigned)	\
+		__builtin_types_compatible_p(typeof(var), unsigned int)	\
 		     ? "%u\n" :						\
 		__builtin_types_compatible_p(typeof(var), long)		\
 		     ? "%li\n" :					\
@@ -379,7 +379,7 @@ struct time_stats {
 
 void bch_time_stats_update(struct time_stats *stats, uint64_t time);
 
-static inline unsigned local_clock_us(void)
+static inline unsigned int local_clock_us(void)
 {
 	return local_clock() >> 10;
 }
@@ -543,9 +543,10 @@ dup:									\
 	container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
 
 /* Does linear interpolation between powers of two */
-static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+static inline unsigned int fract_exp_two(unsigned int x,
+					 unsigned int fract_bits)
 {
-	unsigned fract = x & ~(~0 << fract_bits);
+	unsigned int fract = x & ~(~0 << fract_bits);
 
 	x >>= fract_bits;
 	x   = 1 << x;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 481d4cf38ac0..39ee38ffb2db 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -215,7 +215,8 @@ static void update_writeback_rate(struct work_struct *work)
 	smp_mb();
 }
 
-static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+static unsigned int writeback_delay(struct cached_dev *dc,
+				    unsigned int sectors)
 {
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    !dc->writeback_percent)
@@ -263,7 +264,7 @@ static void write_dirty_finish(struct closure *cl)
 	/* This is kind of a dumb way of signalling errors. */
 	if (KEY_DIRTY(&w->key)) {
 		int ret;
-		unsigned i;
+		unsigned int i;
 		struct keylist keys;
 
 		bch_keylist_init(&keys);
@@ -377,7 +378,7 @@ static void read_dirty_submit(struct closure *cl)
 
 static void read_dirty(struct cached_dev *dc)
 {
-	unsigned delay = 0;
+	unsigned int delay = 0;
 	struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
 	size_t size;
 	int nk, i;
@@ -498,11 +499,11 @@ err:
 
 /* Scan for dirty data */
 
-void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
 				  uint64_t offset, int nr_sectors)
 {
 	struct bcache_device *d = c->devices[inode];
-	unsigned stripe_offset, stripe, sectors_dirty;
+	unsigned int stripe_offset, stripe, sectors_dirty;
 
 	if (!d)
 		return;
@@ -514,7 +515,7 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 	stripe_offset = offset & (d->stripe_size - 1);
 
 	while (nr_sectors) {
-		int s = min_t(unsigned, abs(nr_sectors),
+		int s = min_t(unsigned int, abs(nr_sectors),
 			      d->stripe_size - stripe_offset);
 
 		if (nr_sectors < 0)
@@ -548,7 +549,7 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 static void refill_full_stripes(struct cached_dev *dc)
 {
 	struct keybuf *buf = &dc->writeback_keys;
-	unsigned start_stripe, stripe, next_stripe;
+	unsigned int start_stripe, stripe, next_stripe;
 	bool wrapped = false;
 
 	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
@@ -688,7 +689,7 @@ static int bch_writeback_thread(void *arg)
 		read_dirty(dc);
 
 		if (searched_full_index) {
-			unsigned delay = dc->writeback_delay * HZ;
+			unsigned int delay = dc->writeback_delay * HZ;
 
 			while (delay &&
 			       !kthread_should_stop() &&
@@ -712,7 +713,7 @@ static int bch_writeback_thread(void *arg)
 
 struct sectors_dirty_init {
 	struct btree_op	op;
-	unsigned	inode;
+	unsigned int	inode;
 	size_t		count;
 	struct bkey	start;
 };
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 3745d7004c47..76b691850c98 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -28,7 +28,7 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
 	return ret;
 }
 
-static inline unsigned offset_to_stripe(struct bcache_device *d,
+static inline unsigned int offset_to_stripe(struct bcache_device *d,
 					uint64_t offset)
 {
 	do_div(offset, d->stripe_size);
@@ -37,9 +37,9 @@ static inline unsigned offset_to_stripe(struct bcache_device *d,
 
 static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
 					   uint64_t offset,
-					   unsigned nr_sectors)
+					   unsigned int nr_sectors)
 {
-	unsigned stripe = offset_to_stripe(&dc->disk, offset);
+	unsigned int stripe = offset_to_stripe(&dc->disk, offset);
 
 	while (1) {
 		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
@@ -54,9 +54,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
 }
 
 static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
-				    unsigned cache_mode, bool would_skip)
+				    unsigned int cache_mode, bool would_skip)
 {
-	unsigned in_use = dc->disk.c->gc_stats.in_use;
+	unsigned int in_use = dc->disk.c->gc_stats.in_use;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
@@ -96,7 +96,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 	}
 }
 
-void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned int, uint64_t, int);
 
 void bch_sectors_dirty_init(struct bcache_device *);
 void bch_cached_dev_writeback_init(struct cached_dev *);
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 8d19e02d752a..6bdcb48ee8cf 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -30,10 +30,10 @@ struct bkey {
 	BITMASK(name, struct bkey, field, offset, size)
 
 #define PTR_FIELD(name, offset, size)					\
-static inline __u64 name(const struct bkey *k, unsigned i)		\
+static inline __u64 name(const struct bkey *k, unsigned int i)		\
 { return (k->ptr[i] >> offset) & ~(~0ULL << size); }			\
 									\
-static inline void SET_##name(struct bkey *k, unsigned i, __u64 v)	\
+static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v)	\
 {									\
 	k->ptr[i] &= ~(~(~0ULL << size) << offset);			\
 	k->ptr[i] |= (v & ~(~0ULL << size)) << offset;			\
@@ -120,7 +120,7 @@ static inline struct bkey *bkey_next(const struct bkey *k)
 	return (struct bkey *) (d + bkey_u64s(k));
 }
 
-static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys)
+static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys)
 {
 	__u64 *d = (void *) k;
 	return (struct bkey *) (d + nr_keys);
-- 
cgit v1.2.3


From 1fae7cf05293d3a2c9e59c1bc59372322386467c Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Sat, 11 Aug 2018 13:19:45 +0800
Subject: bcache: style fix to add a blank line after declarations

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c     |  3 +++
 drivers/md/bcache/bcache.h    |  1 +
 drivers/md/bcache/bset.c      |  5 ++++-
 drivers/md/bcache/btree.c     |  7 +++++++
 drivers/md/bcache/closure.c   |  1 +
 drivers/md/bcache/debug.c     |  4 ++--
 drivers/md/bcache/extents.c   |  5 ++++-
 drivers/md/bcache/io.c        |  4 +++-
 drivers/md/bcache/journal.c   |  2 ++
 drivers/md/bcache/movinggc.c  |  2 ++
 drivers/md/bcache/request.c   |  5 ++++-
 drivers/md/bcache/stats.c     |  3 +++
 drivers/md/bcache/super.c     | 13 ++++++++++++-
 drivers/md/bcache/sysfs.c     |  5 +++++
 drivers/md/bcache/util.c      |  1 +
 drivers/md/bcache/writeback.c |  1 +
 include/uapi/linux/bcache.h   |  2 ++
 17 files changed, 57 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 89f663d22551..7a28232d868b 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -244,6 +244,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 	while (!fifo_full(&ca->free_inc)) {
 		size_t n;
+
 		get_random_bytes(&n, sizeof(n));
 
 		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
@@ -514,6 +515,7 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 			 struct bkey *k, int n, bool wait)
 {
 	int ret;
+
 	mutex_lock(&c->bucket_lock);
 	ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
 	mutex_unlock(&c->bucket_lock);
@@ -706,6 +708,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
 		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+
 		if (!b)
 			return -ENOMEM;
 
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1ebd2d9d90d5..fd74dd075951 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -783,6 +783,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
 static inline uint8_t gen_after(uint8_t a, uint8_t b)
 {
 	uint8_t r = a - b;
+
 	return r > 128U ? 0 : r;
 }
 
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index dfda7e9efc3e..6fd5623b2e63 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -585,6 +585,7 @@ static inline unsigned int bfloat_mantissa(const struct bkey *k,
 				       struct bkey_float *f)
 {
 	const uint64_t *p = &k->low - (f->exponent >> 6);
+
 	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
 }
 
@@ -964,6 +965,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 		 * but a branch instruction is avoided.
 		 */
 		unsigned int p = n << 4;
+
 		p &= ((int) (p - t->size)) >> 31;
 
 		prefetch(&t->tree[p]);
@@ -1114,6 +1116,7 @@ static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
 					  struct bset_tree *start)
 {
 	struct bkey *ret = NULL;
+
 	iter->size = ARRAY_SIZE(iter->data);
 	iter->used = 0;
 
@@ -1329,8 +1332,8 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
 			 struct bset_sort_state *state)
 {
 	uint64_t start_time = local_clock();
-
 	struct btree_iter iter;
+
 	bch_btree_iter_init(b, &iter, NULL);
 
 	btree_mergesort(b, new->set->data, &iter, false, true);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 96c39a8db895..4003f92f4d2c 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -287,6 +287,7 @@ err:
 static void btree_node_read_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
+
 	closure_put(cl);
 }
 
@@ -604,6 +605,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 				      struct bkey *k, gfp_t gfp)
 {
 	struct btree *b = kzalloc(sizeof(struct btree), gfp);
+
 	if (!b)
 		return NULL;
 
@@ -746,6 +748,7 @@ void bch_btree_cache_free(struct cache_set *c)
 {
 	struct btree *b;
 	struct closure cl;
+
 	closure_init_stack(&cl);
 
 	if (c->shrink.list.next)
@@ -1124,6 +1127,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b,
 						  struct btree_op *op)
 {
 	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
+
 	if (!IS_ERR_OR_NULL(n)) {
 		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
@@ -2488,6 +2492,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 
 	if (!RB_EMPTY_ROOT(&buf->keys)) {
 		struct keybuf_key *w;
+
 		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
 		buf->start	= START_KEY(&w->key);
 
@@ -2519,6 +2524,7 @@ bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
 {
 	bool ret = false;
 	struct keybuf_key *p, *w, s;
+
 	s.key = *start;
 
 	if (bkey_cmp(end, &buf->start) <= 0 ||
@@ -2545,6 +2551,7 @@ bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
 struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
 {
 	struct keybuf_key *w;
+
 	spin_lock(&buf->lock);
 
 	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 618253683d40..8570fc426e31 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -162,6 +162,7 @@ static struct dentry *closure_debug;
 static int debug_seq_show(struct seq_file *f, void *data)
 {
 	struct closure *cl;
+
 	spin_lock_irq(&closure_list_lock);
 
 	list_for_each_entry(cl, &closure_list, all) {
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 0caad145902b..f0eb37a14dab 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -177,8 +177,8 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
 	while (size) {
 		struct keybuf_key *w;
 		unsigned int bytes = min(i->bytes, size);
-
 		int err = copy_to_user(buf, i->buf, bytes);
+
 		if (err)
 			return err;
 
@@ -237,8 +237,8 @@ void bch_debug_init_cache_set(struct cache_set *c)
 {
 	if (!IS_ERR_OR_NULL(bcache_debug)) {
 		char name[50];
-		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
 
+		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
 		c->debug = debugfs_create_file(name, 0400, bcache_debug, c,
 					       &cache_set_debug_ops);
 	}
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index e96ba928eeb6..8f5de61e1a90 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -134,8 +134,8 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
 
 	for (j = 0; j < KEY_PTRS(k); j++) {
 		size_t n = PTR_BUCKET_NR(b->c, k, j);
-		printk(" bucket %zu", n);
 
+		printk(" bucket %zu", n);
 		if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
 			printk(" prio %i",
 			       PTR_BUCKET(b->c, k, j)->prio);
@@ -166,6 +166,7 @@ bad:
 static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
 {
 	struct btree *b = container_of(bk, struct btree, keys);
+
 	return __bch_btree_ptr_invalid(b->c, k);
 }
 
@@ -334,6 +335,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 
 	while (1) {
 		struct bkey *k = bch_btree_iter_next(iter);
+
 		if (!k)
 			break;
 
@@ -498,6 +500,7 @@ bad:
 static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
 {
 	struct btree *b = container_of(bk, struct btree, keys);
+
 	return __bch_extent_invalid(b->c, k);
 }
 
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index c6b41a09f550..cfc56add799a 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -17,6 +17,7 @@
 void bch_bbio_free(struct bio *bio, struct cache_set *c)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
+
 	mempool_free(b, &c->bio_meta);
 }
 
@@ -45,6 +46,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 		     struct bkey *k, unsigned int ptr)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
+
 	bch_bkey_copy_single_ptr(&b->key, k, ptr);
 	__bch_submit_bbio(bio, c);
 }
@@ -132,12 +134,12 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
 
 	if (threshold) {
 		unsigned int t = local_clock_us();
-
 		int us = t - b->submit_time_us;
 		int congested = atomic_read(&c->congested);
 
 		if (us > (int) threshold) {
 			int ms = us / 1024;
+
 			c->congested_last_us = t;
 
 			ms = min(ms, CONGESTED_MAX + congested);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ee61062b58fc..301cbb43a78f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -28,6 +28,7 @@
 static void journal_read_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
+
 	closure_put(cl);
 }
 
@@ -614,6 +615,7 @@ static void journal_write_unlocked(struct closure *cl)
 
 	struct bio *bio;
 	struct bio_list list;
+
 	bio_list_init(&list);
 
 	if (!w->need_write) {
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 0790d710f911..7891fb512736 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -38,6 +38,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 static void moving_io_destructor(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
+
 	kfree(io);
 }
 
@@ -189,6 +190,7 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
 static unsigned int bucket_heap_top(struct cache *ca)
 {
 	struct bucket *b;
+
 	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
 }
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 6e1a60dd1742..d15d8c5778ed 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -45,6 +45,7 @@ static void bio_csum(struct bio *bio, struct bkey *k)
 
 	bio_for_each_segment(bv, bio, iter) {
 		void *d = kmap(bv.bv_page) + bv.bv_offset;
+
 		csum = bch_crc64_update(csum, d, bv.bv_len);
 		kunmap(bv.bv_page);
 	}
@@ -526,8 +527,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 			? min_t(uint64_t, INT_MAX,
 				KEY_START(k) - bio->bi_iter.bi_sector)
 			: INT_MAX;
-
 		int ret = s->d->cache_miss(b, s, bio, sectors);
+
 		if (ret != MAP_CONTINUE)
 			return ret;
 
@@ -623,6 +624,7 @@ static void request_endio(struct bio *bio)
 
 	if (bio->bi_status) {
 		struct search *s = container_of(cl, struct search, cl);
+
 		s->iop.status = bio->bi_status;
 		/* Only cache read errors are recoverable */
 		s->recoverable = false;
@@ -1212,6 +1214,7 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
 			    unsigned int cmd, unsigned long arg)
 {
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
 	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
 }
 
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 2331a0d5aa28..894410f3f829 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -200,6 +200,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
 			       bool hit, bool bypass)
 {
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
 	mark_cache_stats(&dc->accounting.collector, hit, bypass);
 	mark_cache_stats(&c->accounting.collector, hit, bypass);
 }
@@ -207,6 +208,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
 void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
 {
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
 	atomic_inc(&dc->accounting.collector.cache_readaheads);
 	atomic_inc(&c->accounting.collector.cache_readaheads);
 }
@@ -214,6 +216,7 @@ void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
 void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
 {
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
 	atomic_inc(&dc->accounting.collector.cache_miss_collisions);
 	atomic_inc(&c->accounting.collector.cache_miss_collisions);
 }
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 4ab1b1968d9a..c11cf852715c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -415,8 +415,8 @@ static int __uuid_write(struct cache_set *c)
 {
 	BKEY_PADDED(key) k;
 	struct closure cl;
-	closure_init_stack(&cl);
 
+	closure_init_stack(&cl);
 	lockdep_assert_held(&bch_register_lock);
 
 	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
@@ -456,6 +456,7 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
 static struct uuid_entry *uuid_find_empty(struct cache_set *c)
 {
 	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+
 	return uuid_find(c, zero_uuid);
 }
 
@@ -619,6 +620,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 static int open_dev(struct block_device *b, fmode_t mode)
 {
 	struct bcache_device *d = b->bd_disk->private_data;
+
 	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
 		return -ENXIO;
 
@@ -629,6 +631,7 @@ static int open_dev(struct block_device *b, fmode_t mode)
 static void release_dev(struct gendisk *b, fmode_t mode)
 {
 	struct bcache_device *d = b->private_data;
+
 	closure_put(&d->cl);
 }
 
@@ -919,6 +922,7 @@ void bch_cached_dev_run(struct cached_dev *dc)
 	if (!d->c &&
 	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
 		struct closure cl;
+
 		closure_init_stack(&cl);
 
 		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
@@ -976,6 +980,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
 {
 	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
 	struct closure cl;
+
 	closure_init_stack(&cl);
 
 	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
@@ -1103,6 +1108,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 
 	if (bch_is_zero(u->uuid, 16)) {
 		struct closure cl;
+
 		closure_init_stack(&cl);
 
 		memcpy(u->uuid, dc->sb.uuid, 16);
@@ -1320,6 +1326,7 @@ void bch_flash_dev_release(struct kobject *kobj)
 static void flash_dev_free(struct closure *cl)
 {
 	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
 	mutex_lock(&bch_register_lock);
 	atomic_long_sub(bcache_dev_sectors_dirty(d),
 			&d->c->flash_dev_dirty_sectors);
@@ -1481,6 +1488,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 void bch_cache_set_release(struct kobject *kobj)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
 	kfree(c);
 	module_put(THIS_MODULE);
 }
@@ -1671,6 +1679,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 {
 	int iter_size;
 	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+
 	if (!c)
 		return NULL;
 
@@ -2216,6 +2225,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	err = "failed to register device";
 	if (SB_IS_BDEV(sb)) {
 		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+
 		if (!dc)
 			goto err_close;
 
@@ -2224,6 +2234,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		mutex_unlock(&bch_register_lock);
 	} else {
 		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+
 		if (!ca)
 			goto err_close;
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 3f2b7964d6a9..ba4cd7efca8e 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -459,6 +459,7 @@ STORE(__bch_flash_dev)
 
 	if (attr == &sysfs_size) {
 		uint64_t v;
+
 		strtoi_h_or_return(buf, v);
 
 		u->sectors = v >> 9;
@@ -703,6 +704,7 @@ STORE(__bch_cache_set)
 	if (attr == &sysfs_flash_vol_create) {
 		int r;
 		uint64_t v;
+
 		strtoi_h_or_return(buf, v);
 
 		r = bch_flash_dev_create(c, v);
@@ -736,6 +738,7 @@ STORE(__bch_cache_set)
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;
+
 		sc.gfp_mask = GFP_KERNEL;
 		sc.nr_to_scan = strtoul_or_return(buf);
 		c->shrink.scan_objects(&c->shrink, &sc);
@@ -789,12 +792,14 @@ STORE_LOCKED(bch_cache_set)
 SHOW(bch_cache_set_internal)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, internal);
+
 	return bch_cache_set_show(&c->kobj, attr, buf);
 }
 
 STORE(bch_cache_set_internal)
 {
 	struct cache_set *c = container_of(kobj, struct cache_set, internal);
+
 	return bch_cache_set_store(&c->kobj, attr, buf, size);
 }
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index b15256bcf0e7..18016e7bb32c 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -133,6 +133,7 @@ bool bch_is_zero(const char *p, size_t n)
 int bch_parse_uuid(const char *s, char *uuid)
 {
 	size_t i, j, x;
+
 	memset(uuid, 0, 16);
 
 	for (i = 0, j = 0;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 39ee38ffb2db..44f1b0f1f4d9 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -250,6 +250,7 @@ static void dirty_init(struct keybuf_key *w)
 static void dirty_io_destructor(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
 	kfree(io);
 }
 
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 6bdcb48ee8cf..5d4f58e059fd 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -117,12 +117,14 @@ static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
 static inline struct bkey *bkey_next(const struct bkey *k)
 {
 	__u64 *d = (void *) k;
+
 	return (struct bkey *) (d + bkey_u64s(k));
 }
 
 static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys)
 {
 	__u64 *d = (void *) k;
+
 	return (struct bkey *) (d + nr_keys);
 }
 /* Enough for a key with 6 pointers */
-- 
cgit v1.2.3


From 7723628101aaeb1d723786747529b4ea65c5b5c5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Sun, 12 Aug 2018 10:49:27 -0700
Subject: bpf: Introduce bpf_skb_ancestor_cgroup_id helper

== Problem description ==

It's useful to be able to identify cgroup associated with skb in TC so
that a policy can be applied to this skb, and existing bpf_skb_cgroup_id
helper can help with this.

In real life, though, the cgroup hierarchy and the hierarchy a policy
has to be applied to don't map 1:1.

It's often the case that there is a container and corresponding cgroup,
but there are many more sub-cgroups inside container, e.g. because it's
delegated to containerized application to control resources for its
subsystems, or to separate application inside container from infra that
belongs to containerization system (e.g. sshd).

At the same time it may be useful to apply a policy to container as a
whole.

If multiple containers like this are run on a host (which is often the
case) and many of them have sub-cgroups, it may not be possible to apply
per-container policy in TC with existing helpers such as
bpf_skb_under_cgroup or bpf_skb_cgroup_id:

* bpf_skb_cgroup_id will return id of immediate cgroup associated with
  skb, i.e. if it's a sub-cgroup inside container, it can't be used to
  identify container's cgroup;

* bpf_skb_under_cgroup can work only with one cgroup and doesn't scale,
  i.e. if there are N containers on a host and a policy has to be
  applied to M of them (0 <= M <= N), it'd require M calls to
  bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild &
  load new BPF program.

== Solution ==

The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be
used to get id of cgroup v2 that is an ancestor of cgroup associated
with skb at specified level of cgroup hierarchy.

That way an admin can place all containers on one level of the cgroup
hierarchy (which is good practice in general and already used in many
configurations) and identify a specific cgroup on this level no matter
what sub-cgroup the skb is associated with.

E.g. if there is a cgroup hierarchy:
  root/
  root/container1/
  root/container1/app11/
  root/container1/app11/sub-app-a/
  root/container1/app12/
  root/container2/
  root/container2/app21/
  root/container2/app22/
  root/container2/app22/sub-app-b/

, then having skb associated with root/container1/app11/sub-app-a/ it's
possible to get the ancestor at level 1, which is container1, and apply a
policy for this container, or apply another policy if it's container2.

Policies can be kept e.g. in a hash map where key is a container cgroup
id and value is an action.

Levels where container cgroups are created are usually known in advance,
whereas the cgroup hierarchy inside a container may be hard to predict,
especially when its creation is delegated to the containerized
application.

== Implementation details ==

The helper gets ancestor by walking parents up to specified level.

Another option would be to get a different kind of "id" from
cgroup->ancestor_ids[level] and use it with idr_find() to get the struct
cgroup for the ancestor. But that would require a radix lookup, which
doesn't seem to be better (at least it's not obviously better).

Format of return value of the new helper is same as that of
bpf_skb_cgroup_id.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/cgroup.h   | 30 ++++++++++++++++++++++++++++++
 include/uapi/linux/bpf.h | 21 ++++++++++++++++++++-
 net/core/filter.c        | 28 ++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c9fdf6f57913..32c553556bbd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
 	return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * cgroup_ancestor - find ancestor of cgroup
+ * @cgrp: cgroup to find ancestor of
+ * @ancestor_level: level of ancestor to find starting from root
+ *
+ * Find ancestor of cgroup at specified level starting from root if it exists
+ * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
+ * @ancestor_level.
+ *
+ * This function is safe to call as long as @cgrp is accessible.
+ */
+static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
+					     int ancestor_level)
+{
+	struct cgroup *ptr;
+
+	if (cgrp->level < ancestor_level)
+		return NULL;
+
+	for (ptr = cgrp;
+	     ptr && ptr->level > ancestor_level;
+	     ptr = cgroup_parent(ptr))
+		;
+
+	if (ptr && ptr->level == ancestor_level)
+		return ptr;
+
+	return NULL;
+}
+
 /**
  * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
  * @task: the task to be tested
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3102a2a23c31..66917a4eba27 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2093,6 +2093,24 @@ union bpf_attr {
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *skb* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *skb*, then return value will be same as that
+ *		of **bpf_skb_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *skb*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_skb_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
  * u64 bpf_get_current_cgroup_id(void)
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
@@ -2207,7 +2225,8 @@ union bpf_attr {
 	FN(skb_cgroup_id),		\
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
-	FN(sk_select_reuseport),
+	FN(sk_select_reuseport),	\
+	FN(skb_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 22906b31d43f..15b9d2df92ca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3778,6 +3778,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
 	.ret_type       = RET_INTEGER,
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
+
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+	   ancestor_level)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+	struct cgroup *ancestor;
+	struct cgroup *cgrp;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	ancestor = cgroup_ancestor(cgrp, ancestor_level);
+	if (!ancestor)
+		return 0;
+
+	return ancestor->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
+	.func           = bpf_skb_ancestor_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
 #endif
 
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -4966,6 +4992,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_SOCK_CGROUP_DATA
 	case BPF_FUNC_skb_cgroup_id:
 		return &bpf_skb_cgroup_id_proto;
+	case BPF_FUNC_skb_ancestor_cgroup_id:
+		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
-- 
cgit v1.2.3


From e6f86b0f7ae473969a3301b74bf98af9e42ecd0e Mon Sep 17 00:00:00 2001
From: Virgile Jarry <virgile@acceis.fr>
Date: Fri, 10 Aug 2018 17:48:15 +0200
Subject: ipv6: Add icmp_echo_ignore_all support for ICMPv6

Preventing the kernel from responding to ICMP Echo Requests messages
can be useful in several ways. The sysctl parameter
'icmp_echo_ignore_all' can be used to prevent the kernel from
responding to IPv4 ICMP echo requests. For IPv6 pings, such
a sysctl kernel parameter did not exist.

Add the ability to prevent the kernel from responding to IPv6
ICMP echo requests through the use of the following sysctl
parameter : /proc/sys/net/ipv6/icmp/echo_ignore_all.
Update the documentation to reflect this change.

Signed-off-by: Virgile Jarry <virgile@acceis.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  5 +++++
 include/net/netns/ipv6.h               |  1 +
 include/uapi/linux/sysctl.h            |  3 ++-
 net/ipv6/af_inet6.c                    |  1 +
 net/ipv6/icmp.c                        | 16 +++++++++++++---
 5 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e74515ecaa9c..8313a636dd53 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1882,6 +1882,11 @@ ratelimit - INTEGER
 	otherwise the minimal space between responses in milliseconds.
 	Default: 1000
 
+echo_ignore_all - BOOLEAN
+	If set non-zero, then the kernel will ignore all ICMP ECHO
+	requests sent to it over the IPv6 protocol.
+	Default: 0
+
 xfrm6_gc_thresh - INTEGER
 	The threshold at which we will start garbage collecting for IPv6
 	destination cache entries.  At twice this value the system will
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 762ac9931b62..f0e396ab9bec 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -32,6 +32,7 @@ struct netns_sysctl_ipv6 {
 	int flowlabel_consistency;
 	int auto_flowlabels;
 	int icmpv6_time;
+	int icmpv6_echo_ignore_all;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 6b58371b1f0d..d71013fffaf6 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -575,7 +575,8 @@ enum {
 
 /* /proc/sys/net/ipv6/icmp */
 enum {
-	NET_IPV6_ICMP_RATELIMIT=1
+	NET_IPV6_ICMP_RATELIMIT = 1,
+	NET_IPV6_ICMP_ECHO_IGNORE_ALL = 2
 };
 
 /* /proc/sys/net/<protocol>/neigh/<dev> */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 020f6e14a7af..673bba31eb18 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -832,6 +832,7 @@ static int __net_init inet6_net_init(struct net *net)
 
 	net->ipv6.sysctl.bindv6only = 0;
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
+	net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
 	net->ipv6.sysctl.flowlabel_consistency = 1;
 	net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
 	net->ipv6.sysctl.idgen_retries = 3;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7f6b1f81c200..c9c53ade55c3 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -794,6 +794,7 @@ out:
 
 static int icmpv6_rcv(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb->dev);
 	struct net_device *dev = skb->dev;
 	struct inet6_dev *idev = __in6_dev_get(dev);
 	const struct in6_addr *saddr, *daddr;
@@ -843,7 +844,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
 
 	switch (type) {
 	case ICMPV6_ECHO_REQUEST:
-		icmpv6_echo_reply(skb);
+		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
+			icmpv6_echo_reply(skb);
 		break;
 
 	case ICMPV6_ECHO_REPLY:
@@ -1104,6 +1106,13 @@ static struct ctl_table ipv6_icmp_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
+	{
+		.procname	= "echo_ignore_all",
+		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler = proc_dointvec,
+	},
 	{ },
 };
 
@@ -1115,9 +1124,10 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
 			sizeof(ipv6_icmp_table_template),
 			GFP_KERNEL);
 
-	if (table)
+	if (table) {
 		table[0].data = &net->ipv6.sysctl.icmpv6_time;
-
+		table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
+	}
 	return table;
 }
 #endif
-- 
cgit v1.2.3


From cdb2f401246ef70b37eeedd9c6f9c28d611d8255 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Mon, 13 Aug 2018 18:43:36 +0300
Subject: netfilter: uapi: fix linux/netfilter/nf_osf.h userspace compilation
 errors

Move inclusion of <linux/ip.h> and <linux/tcp.h> from
linux/netfilter/xt_osf.h to linux/netfilter/nf_osf.h to fix
the following linux/netfilter/nf_osf.h userspace compilation errors:

/usr/include/linux/netfilter/nf_osf.h:59:24: error: 'MAX_IPOPTLEN' undeclared here (not in a function)
  struct nf_osf_opt opt[MAX_IPOPTLEN];
/usr/include/linux/netfilter/nf_osf.h:64:17: error: field 'ip' has incomplete type
  struct iphdr   ip;
/usr/include/linux/netfilter/nf_osf.h:65:18: error: field 'tcp' has incomplete type
  struct tcphdr   tcp;

Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf")
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_osf.h | 2 ++
 include/uapi/linux/netfilter/xt_osf.h        | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h
index 76a3527df5dd..272bc3195f2d 100644
--- a/include/uapi/linux/netfilter/nfnetlink_osf.h
+++ b/include/uapi/linux/netfilter/nfnetlink_osf.h
@@ -2,6 +2,8 @@
 #define _NF_OSF_H
 
 #include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
 
 #define MAXGENRELEN	32
 
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index 24102b5286ec..6e466236ca4b 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -21,8 +21,6 @@
 #define _XT_OSF_H
 
 #include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
 #include <linux/netfilter/nfnetlink_osf.h>
 
 #define XT_OSF_GENRE		NF_OSF_GENRE
-- 
cgit v1.2.3


From cbf6898fd69455092c43cd573b38d42c86ddb1e0 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:59:01 -0700
Subject: autofs: add AUTOFS_EXP_FORCED flag

The userspace automount(8) daemon is meant to perform a forced expire when
sent a SIGUSR2.

But since the expiration is routed through the kernel, and the kernel
doesn't send an expire request if the mount is busy, this hasn't worked
since at least autofs version 5.

Add an AUTOFS_EXP_FORCED flag to allow implementation of the feature and
bump the protocol version so user space can check whether it is
implemented, if needed.

Link: http://lkml.kernel.org/r/152937734715.21213.6594007182776598970.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/expire.c           | 62 +++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/auto_fs.h |  8 +++---
 2 files changed, 55 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index dfb666c5b8a2..d441244b79df 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -29,7 +29,8 @@ static inline int autofs_can_expire(struct dentry *dentry,
 }
 
 /* Check a mount point for busyness */
-static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
+static int autofs_mount_busy(struct vfsmount *mnt,
+			     struct dentry *dentry, unsigned int how)
 {
 	struct dentry *top = dentry;
 	struct path path = {.mnt = mnt, .dentry = dentry};
@@ -50,6 +51,12 @@ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 			goto done;
 	}
 
+	/* Not a submount, has a forced expire been requested */
+	if (how & AUTOFS_EXP_FORCED) {
+		status = 0;
+		goto done;
+	}
+
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(path.mnt)) {
 		struct autofs_info *ino;
@@ -189,6 +196,10 @@ static int autofs_direct_busy(struct vfsmount *mnt,
 {
 	pr_debug("top %p %pd\n", top, top);
 
+	/* Forced expire, user space handles busy mounts */
+	if (how & AUTOFS_EXP_FORCED)
+		return 0;
+
 	/* If it's busy update the expiry counters */
 	if (!may_umount_tree(mnt)) {
 		struct autofs_info *ino;
@@ -235,7 +246,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 		 * If the fs is busy update the expiry counter.
 		 */
 		if (d_mountpoint(p)) {
-			if (autofs_mount_busy(mnt, p)) {
+			if (autofs_mount_busy(mnt, p, how)) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
@@ -258,6 +269,10 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 		}
 	}
 
+	/* Forced expire, user space handles busy mounts */
+	if (how & AUTOFS_EXP_FORCED)
+		return 0;
+
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
 	if (!autofs_can_expire(top, timeout, how))
 		return 1;
@@ -280,9 +295,15 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
 
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
-			if (autofs_mount_busy(mnt, p))
+			if (autofs_mount_busy(mnt, p, how))
 				continue;
 
+			/* This isn't a submount so if a forced expire
+			 * has been requested, user space handles busy
+			 * mounts */
+			if (how & AUTOFS_EXP_FORCED)
+				return p;
+
 			/* Can we expire this guy */
 			if (autofs_can_expire(p, timeout, how))
 				return p;
@@ -361,9 +382,15 @@ static struct dentry *should_expire(struct dentry *dentry,
 		pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
 
 		/* Can we umount this guy */
-		if (autofs_mount_busy(mnt, dentry))
+		if (autofs_mount_busy(mnt, dentry, how))
 			return NULL;
 
+		/* This isn't a submount so if a forced expire
+		 * has been requested, user space handles busy
+		 * mounts */
+		if (how & AUTOFS_EXP_FORCED)
+			return dentry;
+
 		/* Can we expire this guy */
 		if (autofs_can_expire(dentry, timeout, how))
 			return dentry;
@@ -372,6 +399,11 @@ static struct dentry *should_expire(struct dentry *dentry,
 
 	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
 		pr_debug("checking symlink %p %pd\n", dentry, dentry);
+
+		/* Forced expire, user space handles busy mounts */
+		if (how & AUTOFS_EXP_FORCED)
+			return dentry;
+
 		/*
 		 * A symlink can't be "busy" in the usual sense so
 		 * just check last used for expire timeout.
@@ -386,10 +418,13 @@ static struct dentry *should_expire(struct dentry *dentry,
 
 	/* Case 2: tree mount, expire iff entire tree is not busy */
 	if (!(how & AUTOFS_EXP_LEAVES)) {
-		/* Path walk currently on this dentry? */
-		ino_count = atomic_read(&ino->count) + 1;
-		if (d_count(dentry) > ino_count)
-			return NULL;
+		/* Not a forced expire? */
+		if (!(how & AUTOFS_EXP_FORCED)) {
+			/* ref-walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 1;
+			if (d_count(dentry) > ino_count)
+				return NULL;
+		}
 
 		if (!autofs_tree_busy(mnt, dentry, timeout, how))
 			return dentry;
@@ -398,12 +433,15 @@ static struct dentry *should_expire(struct dentry *dentry,
 	 *	   (autofs-4.1).
 	 */
 	} else {
-		/* Path walk currently on this dentry? */
 		struct dentry *expired;
 
-		ino_count = atomic_read(&ino->count) + 1;
-		if (d_count(dentry) > ino_count)
-			return NULL;
+		/* Not a forced expire? */
+		if (!(how & AUTOFS_EXP_FORCED)) {
+			/* ref-walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 1;
+			if (d_count(dentry) > ino_count)
+				return NULL;
+		}
 
 		expired = autofs_check_leaves(mnt, dentry, timeout, how);
 		if (expired) {
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index e13eec3dfb2f..df31aa9c9a8c 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		2
+#define AUTOFS_PROTO_SUBVERSION		3
 
 /*
  * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
@@ -90,8 +90,10 @@ enum {
 /* autofs version 4 and later definitions */
 
 /* Mask for expire behaviour */
-#define AUTOFS_EXP_IMMEDIATE		1
-#define AUTOFS_EXP_LEAVES		2
+#define AUTOFS_EXP_NORMAL		0x00
+#define AUTOFS_EXP_IMMEDIATE		0x01
+#define AUTOFS_EXP_LEAVES		0x02
+#define AUTOFS_EXP_FORCED		0x04
 
 #define AUTOFS_TYPE_ANY			0U
 #define AUTOFS_TYPE_INDIRECT		1U
-- 
cgit v1.2.3