From 7b6f08771bf6045c32a2a1e18201c1aeeb78d72a Mon Sep 17 00:00:00 2001
From: Ilan Peer
Date: Wed, 7 Sep 2022 10:34:30 +0300
Subject: wifi: ieee80211: Support validating ML station profile length

Add a function to validate the EHT Multi-Link per-station profile length.

Signed-off-by: Ilan Peer
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 79690938d9a2..bdf668f9dace 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4685,6 +4685,46 @@ struct ieee80211_mle_per_sta_profile {
 	u8 variable[];
 } __packed;
 
+/**
+ * ieee80211_mle_sta_prof_size_ok - validate multi-link element sta profile size
+ * @data: pointer to the sub element data
+ * @len: length of the containing sub element
+ */
+static inline bool ieee80211_mle_sta_prof_size_ok(const u8 *data, size_t len)
+{
+	const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
+	u16 control;
+	u8 fixed = sizeof(*prof);
+	u8 info_len = 1;
+
+	if (len < fixed)
+		return false;
+
+	control = le16_to_cpu(prof->control);
+
+	if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)
+		info_len += 6;
+	if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT)
+		info_len += 2;
+	if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT)
+		info_len += 8;
+	if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT)
+		info_len += 2;
+	if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)
+		info_len += 1;
+
+	if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE &&
+	    control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) {
+		if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE)
+			info_len += 2;
+		else
+			info_len += 1;
+	}
+
+	return prof->sta_info_len >= info_len &&
+	       fixed + prof->sta_info_len <= len;
+}
+
 #define for_each_mle_subelement(_elem, _data, _len)			\
 	if (ieee80211_mle_size_ok(_data, _len))				\
 		for_each_element(_elem,					\
-- 
cgit v1.2.3
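To see what the validator above actually computes, here is a minimal standalone C sketch of the same length check. This is an illustration only, not the kernel code: the STA_CTRL_* values mirror the IEEE80211_MLE_STA_CONTROL_* definitions in this header, and the sample profile bytes are made up.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* control bits, mirroring IEEE80211_MLE_STA_CONTROL_* in ieee80211.h */
#define STA_CTRL_COMPLETE_PROFILE		0x0010
#define STA_CTRL_STA_MAC_ADDR_PRESENT		0x0020
#define STA_CTRL_BEACON_INT_PRESENT		0x0040
#define STA_CTRL_TSF_OFFS_PRESENT		0x0080
#define STA_CTRL_DTIM_INFO_PRESENT		0x0100
#define STA_CTRL_NSTR_LINK_PAIR_PRESENT		0x0200
#define STA_CTRL_NSTR_BITMAP_SIZE		0x0400
#define STA_CTRL_BSS_PARAM_CHANGE_CNT_PRESENT	0x0800

/* per-STA profile layout: 2-octet control, 1-octet sta_info_len, STA info */
static bool sta_prof_size_ok(const uint8_t *data, size_t len)
{
	const size_t fixed = 3;	/* control + sta_info_len */
	uint16_t control;
	uint8_t info_len = 1;	/* the STA Info Length octet counts itself */

	if (len < fixed)
		return false;

	control = data[0] | (uint16_t)data[1] << 8;	/* little endian */

	if (control & STA_CTRL_STA_MAC_ADDR_PRESENT)
		info_len += 6;
	if (control & STA_CTRL_BEACON_INT_PRESENT)
		info_len += 2;
	if (control & STA_CTRL_TSF_OFFS_PRESENT)
		info_len += 8;
	if (control & STA_CTRL_DTIM_INFO_PRESENT)
		info_len += 2;
	if (control & STA_CTRL_BSS_PARAM_CHANGE_CNT_PRESENT)
		info_len += 1;
	if (control & STA_CTRL_COMPLETE_PROFILE &&
	    control & STA_CTRL_NSTR_LINK_PAIR_PRESENT)
		info_len += (control & STA_CTRL_NSTR_BITMAP_SIZE) ? 2 : 1;

	/* advertised STA info must fit both the declared and the real length */
	return data[2] >= info_len && fixed + data[2] <= len;
}

int main(void)
{
	/* made-up profile: link ID 1, complete, STA MAC present; then capab+status */
	const uint8_t prof[] = { 0x31, 0x00, 0x07,
				 0x02, 0x00, 0x00, 0x00, 0x00, 0x01, /* STA MAC */
				 0x31, 0x04, 0x00, 0x00 };

	printf("profile size ok: %s\n",
	       sta_prof_size_ok(prof, sizeof(prof)) ? "yes" : "no");
	return 0;
}

The design point is the same as in the kernel helper: the expected STA Info length is derived purely from the control bitmap, then checked against both the advertised sta_info_len and the bytes actually present.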
From 1403b109c9a5244dc6ab79154f04eecc209ef3d2 Mon Sep 17 00:00:00 2001
From: Ilan Peer
Date: Wed, 7 Sep 2022 17:23:09 +0300
Subject: wifi: cfg80211/mac80211: Fix ML element common size calculation

The common size is part of the length in the data, so don't add it again.

Signed-off-by: Ilan Peer
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bdf668f9dace..442b13333da8 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4573,18 +4573,17 @@ static inline u8 ieee80211_mle_common_size(const u8 *data)
 
 	switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) {
 	case IEEE80211_ML_CONTROL_TYPE_BASIC:
-		common += sizeof(struct ieee80211_mle_basic_common_info);
-		break;
 	case IEEE80211_ML_CONTROL_TYPE_PREQ:
-		common += sizeof(struct ieee80211_mle_preq_common_info);
+	case IEEE80211_ML_CONTROL_TYPE_TDLS:
+		/*
+		 * The length is the first octet pointed to by mle->variable,
+		 * so no need to add anything
+		 */
 		break;
 	case IEEE80211_ML_CONTROL_TYPE_RECONF:
 		if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR)
 			common += ETH_ALEN;
 		return common;
-	case IEEE80211_ML_CONTROL_TYPE_TDLS:
-		common += sizeof(struct ieee80211_mle_tdls_common_info);
-		break;
 	case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS:
 		if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR)
 			common += ETH_ALEN;
-- 
cgit v1.2.3

From fb99c7d4d6d0fb4fe5a953e0c5f6c37a5b796b98 Mon Sep 17 00:00:00 2001
From: Ilan Peer
Date: Sun, 11 Sep 2022 16:55:12 +0300
Subject: wifi: cfg80211/mac80211: Fix ML element common size validation

The Multi-Link element can be fragmented, so its size can exceed 254.
Thus, modify ieee80211_mle_size_ok() to use 'size_t len' instead of
'u8 len'.

Signed-off-by: Ilan Peer
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 442b13333da8..b935a85b2f44 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4601,7 +4601,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data)
  * @data: pointer to the element data
  * @len: length of the containing element
  */
-static inline bool ieee80211_mle_size_ok(const u8 *data, u8 len)
+static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len)
 {
 	const struct ieee80211_multi_link_elem *mle = (const void *)data;
 	u8 fixed = sizeof(*mle);
-- 
cgit v1.2.3
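The fragmentation that motivates the size_t change also motivates the reassembly added by the next patch. The following is a self-contained sketch of the basic scheme, not the mac80211 code (which iterates struct element and copies into the scratch buffer inside struct ieee802_11_elems): an element whose length is 255 is continued by Fragment elements that follow it. The payload bytes here are made up; WLAN_EID_FRAGMENT's value of 242 matches the kernel definition.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EID_FRAGMENT 242	/* WLAN_EID_FRAGMENT */

/*
 * Reassemble an element that may be split into (id, len=255) plus
 * following fragment elements. 'buf' points at the first element
 * header; returns the total defragmented payload length copied to 'out'.
 */
static size_t defragment(const uint8_t *buf, size_t buflen,
			 uint8_t frag_id, uint8_t *out, size_t outlen)
{
	size_t copied = 0, off = 0;

	while (off + 2 <= buflen) {
		uint8_t id = buf[off], len = buf[off + 1];

		if (off + 2 + len > buflen)
			break;
		/* after the first element, only fragments may continue it */
		if (off && id != frag_id)
			break;
		if (copied + len > outlen)
			return 0;
		memcpy(out + copied, &buf[off + 2], len);
		copied += len;
		off += 2 + len;
		if (len < 255)	/* a short element ends the chain */
			break;
	}
	return copied;
}

int main(void)
{
	/* made-up element 42 carrying 300 bytes: 255 + 45 via a fragment */
	uint8_t buf[2 + 255 + 2 + 45];
	uint8_t out[512];

	buf[0] = 42;
	buf[1] = 255;
	memset(&buf[2], 0xab, 255);
	buf[2 + 255] = EID_FRAGMENT;
	buf[2 + 255 + 1] = 45;
	memset(&buf[2 + 255 + 2], 0xcd, 45);

	printf("reassembled %zu bytes\n",
	       defragment(buf, sizeof(buf), EID_FRAGMENT, out, sizeof(out)));
	return 0;
}

Inside a Multi-Link element the same pattern recurs one level down: a per-STA profile subelement of length 255 is continued by subelements with ID 254 (IEEE80211_MLE_SUBELEM_FRAGMENT, defined in the next patch).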
From 45ebac4f059b92906e7e86dd1a780739f883857c Mon Sep 17 00:00:00 2001
From: Ilan Peer
Date: Tue, 6 Sep 2022 11:48:56 +0300
Subject: wifi: mac80211: Parse station profile from association response

When processing an association response frame for a Multi-Link
connection, extract the per-station profile for each additional link,
and use it for parsing the link elements.

As the Multi-Link element might be fragmented, add support for
reassembling a fragmented element. To simplify the memory management
logic, extend 'struct ieee802_11_elems' to hold a scratch buffer, which
is used for the defragmentation. Once an element is reconstructed in the
scratch area, point the corresponding element pointer to it. Currently
only defragmentation of the Multi-Link element and the contained per-STA
profile subelement is supported.

Signed-off-by: Ilan Peer
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h  |   1 +
 net/mac80211/ieee80211_i.h |  26 +++++++-
 net/mac80211/mlme.c        |  17 ++++-
 net/mac80211/util.c        | 150 ++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 188 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b935a85b2f44..f51e939f28f2 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4666,6 +4666,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len)
 
 enum ieee80211_mle_subelems {
 	IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0,
+	IEEE80211_MLE_SUBELEM_FRAGMENT = 254,
 };
 
 #define IEEE80211_MLE_STA_CONTROL_LINK_ID		0x000f
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4e1d4c339f2d..5dcbc8de53fd 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1707,8 +1707,27 @@ struct ieee802_11_elems {
 	u8 tx_pwr_env_num;
 	u8 eht_cap_len;
 
+	/* multi-link element can be de-fragmented and thus u8 is not sufficient */
+	size_t multi_link_len;
+
+	/*
+	 * store the per-station profile pointer and length in case the
+	 * parsing also handled Multi-Link element parsing for a specific link
+	 * ID.
+	 */
+	struct ieee80211_mle_per_sta_profile *prof;
+	size_t sta_prof_len;
+
 	/* whether a parse error occurred while retrieving these elements */
 	bool parse_error;
+
+	/*
+	 * scratch buffer that can be used for various element parsing related
+	 * tasks, e.g., element de-fragmentation etc.
+	 */
+	size_t scratch_len;
+	u8 *scratch_pos;
+	u8 scratch[];
 };
 
 static inline struct ieee80211_local *hw_to_local(
@@ -2197,9 +2216,13 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
 *	represent a non-transmitting BSS in which case the data
 *	for that non-transmitting BSS is returned
 * @link_id: the link ID to parse elements for, if a STA profile
-*	is present in the multi-link element, or -1 to ignore
+*	is present in the multi-link element, or -1 to ignore;
+*	note that the code currently assumes parsing an association
+*	(or re-association) response frame if this is given
 * @from_ap: frame is received from an AP (currently used only
 *	for EHT capabilities parsing)
+* @scratch_len: if non-zero, specifies the requested length of the scratch
+*	buffer; otherwise, 'len' is used.
 */
 struct ieee80211_elems_parse_params {
 	const u8 *start;
@@ -2210,6 +2233,7 @@ struct ieee80211_elems_parse_params {
 	struct cfg80211_bss *bss;
 	int link_id;
 	bool from_ap;
+	size_t scratch_len;
 };
 
 struct ieee802_11_elems *
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 54b8d5065bbd..a7e06c8ddaf3 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3923,11 +3923,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 	struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
 	struct ieee80211_bss_conf *bss_conf = link->conf;
 	struct ieee80211_local *local = sdata->local;
+	unsigned int link_id = link->link_id;
 	struct ieee80211_elems_parse_params parse_params = {
 		.start = elem_start,
 		.len = elem_len,
 		.bss = cbss,
-		.link_id = link == &sdata->deflink ? -1 : link->link_id,
+		.link_id = link_id == assoc_data->assoc_link_id ?
-1 : link_id, .from_ap = true, }; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -3942,8 +3943,18 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, if (!elems) return false; - /* FIXME: use from STA profile element after parsing that */ - capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + if (link_id == assoc_data->assoc_link_id) { + capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + } else if (!elems->prof) { + ret = false; + goto out; + } else { + const u8 *ptr = elems->prof->variable + + elems->prof->sta_info_len - 1; + + /* FIXME: need to also handle the status code */ + capab_info = get_unaligned_le16(ptr); + } if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bf7461c41bef..40b75fa82b15 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1026,8 +1026,10 @@ ieee80211_parse_extension_element(u32 *crc, elems->eht_operation = data; break; case WLAN_EID_EXT_EHT_MULTI_LINK: - if (ieee80211_mle_size_ok(data, len)) + if (ieee80211_mle_size_ok(data, len)) { elems->multi_link = (void *)data; + elems->multi_link_len = len; + } break; } } @@ -1497,6 +1499,145 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, return found ? profile_len : 0; } +static void ieee80211_defragment_element(struct ieee802_11_elems *elems, + void **elem_ptr, size_t *len, + size_t total_len, u8 frag_id) +{ + u8 *data = *elem_ptr, *pos, *start; + const struct element *elem; + + /* + * Since 'data' points to the data of the element, not the element + * itself, allow 254 in case it was an extended element where the + * extended ID isn't part of the data we see here and thus not part of + * 'len' either. 
+	 */
+	if (!data || (*len != 254 && *len != 255))
+		return;
+
+	start = elems->scratch_pos;
+
+	if (WARN_ON(*len > (elems->scratch + elems->scratch_len -
+			    elems->scratch_pos)))
+		return;
+
+	memcpy(elems->scratch_pos, data, *len);
+	elems->scratch_pos += *len;
+
+	pos = data + *len;
+	total_len -= *len;
+	for_each_element(elem, pos, total_len) {
+		if (elem->id != frag_id)
+			break;
+
+		if (WARN_ON(elem->datalen >
+			    (elems->scratch + elems->scratch_len -
+			     elems->scratch_pos)))
+			return;
+
+		memcpy(elems->scratch_pos, elem->data, elem->datalen);
+		elems->scratch_pos += elem->datalen;
+
+		*len += elem->datalen;
+	}
+
+	*elem_ptr = start;
+}
+
+static void ieee80211_mle_get_sta_prof(struct ieee802_11_elems *elems,
+				       u8 link_id)
+{
+	const struct ieee80211_multi_link_elem *ml = elems->multi_link;
+	size_t ml_len = elems->multi_link_len;
+	const struct element *sub;
+
+	if (!ml || !ml_len)
+		return;
+
+	if (le16_get_bits(ml->control, IEEE80211_ML_CONTROL_TYPE) !=
+	    IEEE80211_ML_CONTROL_TYPE_BASIC)
+		return;
+
+	for_each_mle_subelement(sub, (u8 *)ml, ml_len) {
+		struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data;
+		u16 control;
+
+		if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE)
+			continue;
+
+		if (!ieee80211_mle_sta_prof_size_ok(sub->data, sub->datalen))
+			return;
+
+		control = le16_to_cpu(prof->control);
+
+		if (link_id != u16_get_bits(control,
+					    IEEE80211_MLE_STA_CONTROL_LINK_ID))
+			continue;
+
+		if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE))
+			return;
+
+		elems->prof = prof;
+		elems->sta_prof_len = sub->datalen;
+
+		/* the sub element can be fragmented */
+		ieee80211_defragment_element(elems, (void **)&elems->prof,
+					     &elems->sta_prof_len,
+					     ml_len - (sub->data - (u8 *)ml),
+					     IEEE80211_MLE_SUBELEM_FRAGMENT);
+		return;
+	}
+}
+
+static void ieee80211_mle_parse_link(struct ieee802_11_elems *elems,
+				     struct ieee80211_elems_parse_params *params)
+{
+	struct ieee80211_mle_per_sta_profile *prof;
+	struct ieee80211_elems_parse_params sub = {
+		.action = params->action,
+		.from_ap = params->from_ap,
+		.link_id = -1,
+	};
+	const struct element *non_inherit = NULL;
+	const u8 *end;
+
+	if (params->link_id == -1)
+		return;
+
+	ieee80211_defragment_element(elems, (void **)&elems->multi_link,
+				     &elems->multi_link_len,
+				     elems->total_len - ((u8 *)elems->multi_link -
+							 elems->ie_start),
+				     WLAN_EID_FRAGMENT);
+
+	ieee80211_mle_get_sta_prof(elems, params->link_id);
+	prof = elems->prof;
+
+	if (!prof)
+		return;
+
+	/* check if we have the 4 bytes for the fixed part in assoc response */
+	if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) {
+		elems->prof = NULL;
+		elems->sta_prof_len = 0;
+		return;
+	}
+
+	/*
+	 * Skip the capability information and the status code that are expected
+	 * as part of the station profile in association response frames. Note
+	 * the -1 is because the 'sta_info_len' is accounted for as part of the
+	 * per-STA profile, but not part of the 'u8 variable[]' portion.
+	 */
+	sub.start = prof->variable + prof->sta_info_len - 1 + 4;
+	end = (const u8 *)prof + elems->sta_prof_len;
+	sub.len = end - sub.start;
+
+	non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
+					     sub.start, sub.len);
+	_ieee802_11_parse_elems_full(&sub, elems, non_inherit);
+}
+
 struct ieee802_11_elems *
 ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
 {
@@ -1504,12 +1645,15 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
 	const struct element *non_inherit = NULL;
 	u8 *nontransmitted_profile;
 	int nontransmitted_profile_len = 0;
+	size_t scratch_len = params->scratch_len ?: 2 * params->len;
 
-	elems = kzalloc(sizeof(*elems), GFP_ATOMIC);
+	elems = kzalloc(sizeof(*elems) + scratch_len, GFP_ATOMIC);
 	if (!elems)
 		return NULL;
 	elems->ie_start = params->start;
 	elems->total_len = params->len;
+	elems->scratch_len = scratch_len;
+	elems->scratch_pos = elems->scratch;
 
 	nontransmitted_profile = kmalloc(params->len, GFP_ATOMIC);
 	if (nontransmitted_profile) {
@@ -1537,6 +1681,8 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
 		_ieee802_11_parse_elems_full(&sub, elems, NULL);
 	}
 
+	ieee80211_mle_parse_link(elems, params);
+
 	if (elems->tim && !elems->parse_error) {
 		const struct ieee80211_tim_ie *tim_ie = elems->tim;
-- 
cgit v1.2.3

From 1177aaa7fe9373c762cd5bf5f5de8517bac989d5 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Sat, 17 Sep 2022 03:14:53 +0200
Subject: wifi: fix multi-link element subelement iteration

The subelements obviously start after the common data, including the
common multi-link element structure definition itself. This bug was
possibly just hidden by the higher bits of the control field being set
to 0, so the iteration just found one bogus element and most of the
code could continue anyway.

Fixes: 0f48b8b88aa9 ("wifi: ieee80211: add definitions for multi-link element")
Signed-off-by: Johannes Berg
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f51e939f28f2..6252f02f38b7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4593,7 +4593,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data)
 		return 0;
 	}
 
-	return common + mle->variable[0];
+	return sizeof(*mle) + common + mle->variable[0];
 }
 
 /**
-- 
cgit v1.2.3
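The arithmetic this fix restores can be seen in a tiny standalone sketch (illustrative only; the buffer contents are made up, and the hard-coded 2 stands for sizeof(struct ieee80211_multi_link_elem), i.e. the 16-bit Multi-Link Control field):

#include <stdint.h>
#include <stdio.h>

/*
 * Offset of the first sub-element in a Multi-Link element body:
 * 2 octets of Multi-Link Control, plus the Common Info field, whose
 * first octet is the Common Info length and counts itself.
 */
static unsigned int mle_common_size(const uint8_t *body)
{
	return 2 + body[2];
}

int main(void)
{
	/* made-up basic MLE body: control, then common info = len(1) + MLD MAC(6) */
	const uint8_t body[] = { 0x00, 0x00, 0x07,
				 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };

	/* prints 9: without the leading 2 the iteration started too early */
	printf("sub-elements start at offset %u\n", mle_common_size(body));
	return 0;
}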
From a6a6163b9a934df5965792d60af1f53aa51fdd39 Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Fri, 7 Oct 2022 10:53:03 +0200
Subject: mac802154: Introduce filtering levels

The 802.15.4 specification details several filtering levels at which the
PHY and the MAC can operate. The amount of filtering will vary if they
are in promiscuous mode or in scanning mode. Otherwise they are expected
to do some very basic checks, such as enforcing frame validity. Either
the PHY is able to do so, and the MAC has nothing to do, or the PHY has
a lower filtering level than expected and the MAC should take over.

For now we just define these levels in an enumeration.

As a second step, we will add a per-PHY parameter showing the expected
filtering level as well as a per-device current filtering level, and
will initialize all these fields. As a third step, we will use them to
apply more filtering by software when the PHY is limited. Indeed, if the
drivers know they cannot reach the requested level of filtering, they
will overwrite the "current filtering" parameter so that it reflects
what they do.

Then, in the core, the expected filtering level will be used to decide
whether some additional software processing is needed or not.

Signed-off-by: Miquel Raynal
Acked-by: Alexander Aring
Link: https://lore.kernel.org/r/20221007085310.503366-2-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt
---
 include/linux/ieee802154.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index f1f9412b6ac6..0303eb84d596 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -276,6 +276,30 @@ enum {
 	IEEE802154_SYSTEM_ERROR = 0xff,
 };
 
+/**
+ * enum ieee802154_filtering_level - Filtering levels applicable to a PHY
+ *
+ * @IEEE802154_FILTERING_NONE: No filtering at all, what is received is
+ *	forwarded to the softMAC
+ * @IEEE802154_FILTERING_1_FCS: First filtering level, frames with an invalid
+ *	FCS should be dropped
+ * @IEEE802154_FILTERING_2_PROMISCUOUS: Second filtering level, promiscuous
+ *	mode as described in the spec, identical in terms of filtering to
+ *	level one on the PHY side, but at the MAC level the frame should be
+ *	forwarded to the upper layer directly
+ * @IEEE802154_FILTERING_3_SCAN: Third filtering level, scan related, where
+ *	only beacons must be processed, all remaining traffic gets dropped
+ * @IEEE802154_FILTERING_4_FRAME_FIELDS: Fourth filtering level actually
+ *	enforcing the validity of the content of the frame with various checks
+ */
+enum ieee802154_filtering_level {
+	IEEE802154_FILTERING_NONE,
+	IEEE802154_FILTERING_1_FCS,
+	IEEE802154_FILTERING_2_PROMISCUOUS,
+	IEEE802154_FILTERING_3_SCAN,
+	IEEE802154_FILTERING_4_FRAME_FIELDS,
+};
+
 /* frame control handling */
 #define IEEE802154_FCTL_FTYPE		0x0003
 #define IEEE802154_FCTL_ACKREQ		0x0020
-- 
cgit v1.2.3

From e6c86c513f440bec5f1046539c7e3c6c653842da Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 14 Oct 2022 19:39:43 +0800
Subject: rcu-tasks: Provide rcu_trace_implies_rcu_gp()

As an accident of implementation, an RCU Tasks Trace grace period also
acts as an RCU grace period. However, this could change at any time.
This commit therefore creates an rcu_trace_implies_rcu_gp() that
currently returns true to codify this accident. Code relying on this
accident must call this function to verify that this accident is still
happening.

Reported-by: Hou Tao
Signed-off-by: Paul E. McKenney
Cc: Alexei Starovoitov
Cc: Martin KaFai Lau
Link: https://lore.kernel.org/r/20221014113946.965131-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/rcupdate.h | 12 ++++++++++++
 kernel/rcu/tasks.h       |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 08605ce7379d..8822f06e4b40 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { }
 static inline void exit_tasks_rcu_finish(void) { }
 #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
 
+/**
+ * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
+ *
+ * As an accident of implementation, an RCU Tasks Trace grace period also
+ * acts as an RCU grace period. However, this could change at any time.
+ * Code relying on this accident must call this function to verify that
+ * this accident is still happening.
+ *
+ * You have been warned!
+ */ +static inline bool rcu_trace_implies_rcu_gp(void) { return true; } + /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f5bf6fb430da..9435e5a7b53e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1535,6 +1535,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) { // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). + + // If you remove the following line, update rcu_trace_implies_rcu_gp()!!! synchronize_rcu(); // Any tasks that exit after this point will set // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. -- cgit v1.2.3 From a2fd08448f2b7591fc7ec8dec2e5e025a43f0cee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Oct 2022 14:18:26 +0200 Subject: net: remove smc911x driver This driver was used on Arm and SH machines until 2009, when the last platforms moved to the smsc911x driver for the same hardware. Time to retire this version. Link: https://lore.kernel.org/netdev/1232010482-3744-1-git-send-email-steve.glendinning@smsc.com/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20221017121900.3520108-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/smsc/Kconfig | 14 - drivers/net/ethernet/smsc/Makefile | 1 - drivers/net/ethernet/smsc/smc911x.c | 2198 ----------------------------------- drivers/net/ethernet/smsc/smc911x.h | 901 -------------- include/linux/smc911x.h | 14 - 5 files changed, 3128 deletions(-) delete mode 100644 drivers/net/ethernet/smsc/smc911x.c delete mode 100644 drivers/net/ethernet/smsc/smc911x.h delete mode 100644 include/linux/smc911x.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/smsc/Kconfig b/drivers/net/ethernet/smsc/Kconfig index 2524c907f386..5f22a8a4d27b 100644 --- a/drivers/net/ethernet/smsc/Kconfig +++ b/drivers/net/ethernet/smsc/Kconfig @@ -75,20 +75,6 @@ config EPIC100 More specific information and updates are available from . -config SMC911X - tristate "SMSC LAN911[5678] support" - select CRC32 - select MII - depends on (ARM || SUPERH || COMPILE_TEST) - help - This is a driver for SMSC's LAN911x series of Ethernet chipsets - including the new LAN9115, LAN9116, LAN9117, and LAN9118. - Say Y here if you want it compiled into the kernel. - - This driver is also available as a module. The module will be - called smc911x. If you want to compile it as a module, say M - here and read - config SMSC911X tristate "SMSC LAN911x/LAN921x families embedded ethernet support" depends on HAS_IOMEM diff --git a/drivers/net/ethernet/smsc/Makefile b/drivers/net/ethernet/smsc/Makefile index 4105912b1629..1501fa364c13 100644 --- a/drivers/net/ethernet/smsc/Makefile +++ b/drivers/net/ethernet/smsc/Makefile @@ -8,5 +8,4 @@ obj-$(CONFIG_SMC91X) += smc91x.o obj-$(CONFIG_PCMCIA_SMC91C92) += smc91c92_cs.o obj-$(CONFIG_EPIC100) += epic100.o obj-$(CONFIG_SMSC9420) += smsc9420.o -obj-$(CONFIG_SMC911X) += smc911x.o obj-$(CONFIG_SMSC911X) += smsc911x.o diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c deleted file mode 100644 index 52ecfb461c41..000000000000 --- a/drivers/net/ethernet/smsc/smc911x.c +++ /dev/null @@ -1,2198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * smc911x.c - * This is a driver for SMSC's LAN911{5,6,7,8} single-chip Ethernet devices. 
- * - * Copyright (C) 2005 Sensoria Corp - * Derived from the unified SMC91x driver by Nicolas Pitre - * and the smsc911x.c reference driver by SMSC - * - * Arguments: - * watchdog = TX watchdog timeout - * tx_fifo_kb = Size of TX FIFO in KB - * - * History: - * 04/16/05 Dustin McIntire Initial version - */ -static const char version[] = - "smc911x.c: v1.0 04-16-2005 by Dustin McIntire \n"; - -/* Debugging options */ -#define ENABLE_SMC_DEBUG_RX 0 -#define ENABLE_SMC_DEBUG_TX 0 -#define ENABLE_SMC_DEBUG_DMA 0 -#define ENABLE_SMC_DEBUG_PKTS 0 -#define ENABLE_SMC_DEBUG_MISC 0 -#define ENABLE_SMC_DEBUG_FUNC 0 - -#define SMC_DEBUG_RX ((ENABLE_SMC_DEBUG_RX ? 1 : 0) << 0) -#define SMC_DEBUG_TX ((ENABLE_SMC_DEBUG_TX ? 1 : 0) << 1) -#define SMC_DEBUG_DMA ((ENABLE_SMC_DEBUG_DMA ? 1 : 0) << 2) -#define SMC_DEBUG_PKTS ((ENABLE_SMC_DEBUG_PKTS ? 1 : 0) << 3) -#define SMC_DEBUG_MISC ((ENABLE_SMC_DEBUG_MISC ? 1 : 0) << 4) -#define SMC_DEBUG_FUNC ((ENABLE_SMC_DEBUG_FUNC ? 1 : 0) << 5) - -#ifndef SMC_DEBUG -#define SMC_DEBUG ( SMC_DEBUG_RX | \ - SMC_DEBUG_TX | \ - SMC_DEBUG_DMA | \ - SMC_DEBUG_PKTS | \ - SMC_DEBUG_MISC | \ - SMC_DEBUG_FUNC \ - ) -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include - -#include "smc911x.h" - -/* - * Transmit timeout, default 5 seconds. - */ -static int watchdog = 5000; -module_param(watchdog, int, 0400); -MODULE_PARM_DESC(watchdog, "transmit timeout in milliseconds"); - -static int tx_fifo_kb=8; -module_param(tx_fifo_kb, int, 0400); -MODULE_PARM_DESC(tx_fifo_kb,"transmit FIFO size in KB (1 0 -#define DBG(n, dev, args...) \ - do { \ - if (SMC_DEBUG & (n)) \ - netdev_dbg(dev, args); \ - } while (0) - -#define PRINTK(dev, args...) netdev_info(dev, args) -#else -#define DBG(n, dev, args...) \ - while (0) { \ - netdev_dbg(dev, args); \ - } -#define PRINTK(dev, args...) 
netdev_dbg(dev, args) -#endif - -#if SMC_DEBUG_PKTS > 0 -static void PRINT_PKT(u_char *buf, int length) -{ - int i; - int remainder; - int lines; - - lines = length / 16; - remainder = length % 16; - - for (i = 0; i < lines ; i ++) { - int cur; - printk(KERN_DEBUG); - for (cur = 0; cur < 8; cur++) { - u_char a, b; - a = *buf++; - b = *buf++; - pr_cont("%02x%02x ", a, b); - } - pr_cont("\n"); - } - printk(KERN_DEBUG); - for (i = 0; i < remainder/2 ; i++) { - u_char a, b; - a = *buf++; - b = *buf++; - pr_cont("%02x%02x ", a, b); - } - pr_cont("\n"); -} -#else -static inline void PRINT_PKT(u_char *buf, int length) { } -#endif - - -/* this enables an interrupt in the interrupt mask register */ -#define SMC_ENABLE_INT(lp, x) do { \ - unsigned int __mask; \ - __mask = SMC_GET_INT_EN((lp)); \ - __mask |= (x); \ - SMC_SET_INT_EN((lp), __mask); \ -} while (0) - -/* this disables an interrupt from the interrupt mask register */ -#define SMC_DISABLE_INT(lp, x) do { \ - unsigned int __mask; \ - __mask = SMC_GET_INT_EN((lp)); \ - __mask &= ~(x); \ - SMC_SET_INT_EN((lp), __mask); \ -} while (0) - -/* - * this does a soft reset on the device - */ -static void smc911x_reset(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int reg, timeout=0, resets=1, irq_cfg; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - /* Take out of PM setting first */ - if ((SMC_GET_PMT_CTRL(lp) & PMT_CTRL_READY_) == 0) { - /* Write to the bytetest will take out of powerdown */ - SMC_SET_BYTE_TEST(lp, 0); - timeout=10; - do { - udelay(10); - reg = SMC_GET_PMT_CTRL(lp) & PMT_CTRL_READY_; - } while (--timeout && !reg); - if (timeout == 0) { - PRINTK(dev, "smc911x_reset timeout waiting for PM restore\n"); - return; - } - } - - /* Disable all interrupts */ - spin_lock_irqsave(&lp->lock, flags); - SMC_SET_INT_EN(lp, 0); - spin_unlock_irqrestore(&lp->lock, flags); - - while (resets--) { - SMC_SET_HW_CFG(lp, HW_CFG_SRST_); - timeout=10; - do { - udelay(10); - reg = SMC_GET_HW_CFG(lp); - /* If chip indicates reset timeout then try again */ - if (reg & HW_CFG_SRST_TO_) { - PRINTK(dev, "chip reset timeout, retrying...\n"); - resets++; - break; - } - } while (--timeout && (reg & HW_CFG_SRST_)); - } - if (timeout == 0) { - PRINTK(dev, "smc911x_reset timeout waiting for reset\n"); - return; - } - - /* make sure EEPROM has finished loading before setting GPIO_CFG */ - timeout=1000; - while (--timeout && (SMC_GET_E2P_CMD(lp) & E2P_CMD_EPC_BUSY_)) - udelay(10); - - if (timeout == 0){ - PRINTK(dev, "smc911x_reset timeout waiting for EEPROM busy\n"); - return; - } - - /* Initialize interrupts */ - SMC_SET_INT_EN(lp, 0); - SMC_ACK_INT(lp, -1); - - /* Reset the FIFO level and flow control settings */ - SMC_SET_HW_CFG(lp, (lp->tx_fifo_kb & 0xF) << 16); -//TODO: Figure out what appropriate pause time is - SMC_SET_FLOW(lp, FLOW_FCPT_ | FLOW_FCEN_); - SMC_SET_AFC_CFG(lp, lp->afc_cfg); - - - /* Set to LED outputs */ - SMC_SET_GPIO_CFG(lp, 0x70070000); - - /* - * Deassert IRQ for 1*10us for edge type interrupts - * and drive IRQ pin push-pull - */ - irq_cfg = (1 << 24) | INT_CFG_IRQ_EN_ | INT_CFG_IRQ_TYPE_; -#ifdef SMC_DYNAMIC_BUS_CONFIG - if (lp->cfg.irq_polarity) - irq_cfg |= INT_CFG_IRQ_POL_; -#endif - SMC_SET_IRQ_CFG(lp, irq_cfg); - - /* clear anything saved */ - if (lp->pending_tx_skb != NULL) { - dev_kfree_skb (lp->pending_tx_skb); - lp->pending_tx_skb = NULL; - dev->stats.tx_errors++; - dev->stats.tx_aborted_errors++; - } -} - -/* - * Enable Interrupts, Receive, and Transmit - */ 
-static void smc911x_enable(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned mask, cfg, cr; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - spin_lock_irqsave(&lp->lock, flags); - - SMC_SET_MAC_ADDR(lp, dev->dev_addr); - - /* Enable TX */ - cfg = SMC_GET_HW_CFG(lp); - cfg &= HW_CFG_TX_FIF_SZ_ | 0xFFF; - cfg |= HW_CFG_SF_; - SMC_SET_HW_CFG(lp, cfg); - SMC_SET_FIFO_TDA(lp, 0xFF); - /* Update TX stats on every 64 packets received or every 1 sec */ - SMC_SET_FIFO_TSL(lp, 64); - SMC_SET_GPT_CFG(lp, GPT_CFG_TIMER_EN_ | 10000); - - SMC_GET_MAC_CR(lp, cr); - cr |= MAC_CR_TXEN_ | MAC_CR_HBDIS_; - SMC_SET_MAC_CR(lp, cr); - SMC_SET_TX_CFG(lp, TX_CFG_TX_ON_); - - /* Add 2 byte padding to start of packets */ - SMC_SET_RX_CFG(lp, (2<<8) & RX_CFG_RXDOFF_); - - /* Turn on receiver and enable RX */ - if (cr & MAC_CR_RXEN_) - DBG(SMC_DEBUG_RX, dev, "Receiver already enabled\n"); - - SMC_SET_MAC_CR(lp, cr | MAC_CR_RXEN_); - - /* Interrupt on every received packet */ - SMC_SET_FIFO_RSA(lp, 0x01); - SMC_SET_FIFO_RSL(lp, 0x00); - - /* now, enable interrupts */ - mask = INT_EN_TDFA_EN_ | INT_EN_TSFL_EN_ | INT_EN_RSFL_EN_ | - INT_EN_GPT_INT_EN_ | INT_EN_RXDFH_INT_EN_ | INT_EN_RXE_EN_ | - INT_EN_PHY_INT_EN_; - if (IS_REV_A(lp->revision)) - mask|=INT_EN_RDFL_EN_; - else { - mask|=INT_EN_RDFO_EN_; - } - SMC_ENABLE_INT(lp, mask); - - spin_unlock_irqrestore(&lp->lock, flags); -} - -/* - * this puts the device in an inactive state - */ -static void smc911x_shutdown(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned cr; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "%s: --> %s\n", CARDNAME, __func__); - - /* Disable IRQ's */ - SMC_SET_INT_EN(lp, 0); - - /* Turn of Rx and TX */ - spin_lock_irqsave(&lp->lock, flags); - SMC_GET_MAC_CR(lp, cr); - cr &= ~(MAC_CR_TXEN_ | MAC_CR_RXEN_ | MAC_CR_HBDIS_); - SMC_SET_MAC_CR(lp, cr); - SMC_SET_TX_CFG(lp, TX_CFG_STOP_TX_); - spin_unlock_irqrestore(&lp->lock, flags); -} - -static inline void smc911x_drop_pkt(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int fifo_count, timeout, reg; - - DBG(SMC_DEBUG_FUNC | SMC_DEBUG_RX, dev, "%s: --> %s\n", - CARDNAME, __func__); - fifo_count = SMC_GET_RX_FIFO_INF(lp) & 0xFFFF; - if (fifo_count <= 4) { - /* Manually dump the packet data */ - while (fifo_count--) - SMC_GET_RX_FIFO(lp); - } else { - /* Fast forward through the bad packet */ - SMC_SET_RX_DP_CTRL(lp, RX_DP_CTRL_FFWD_BUSY_); - timeout=50; - do { - udelay(10); - reg = SMC_GET_RX_DP_CTRL(lp) & RX_DP_CTRL_FFWD_BUSY_; - } while (--timeout && reg); - if (timeout == 0) { - PRINTK(dev, "timeout waiting for RX fast forward\n"); - } - } -} - -/* - * This is the procedure to handle the receipt of a packet. - * It should be called after checking for packet presence in - * the RX status FIFO. It must be called with the spin lock - * already held. 
- */ -static inline void smc911x_rcv(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int pkt_len, status; - struct sk_buff *skb; - unsigned char *data; - - DBG(SMC_DEBUG_FUNC | SMC_DEBUG_RX, dev, "--> %s\n", - __func__); - status = SMC_GET_RX_STS_FIFO(lp); - DBG(SMC_DEBUG_RX, dev, "Rx pkt len %d status 0x%08x\n", - (status & 0x3fff0000) >> 16, status & 0xc000ffff); - pkt_len = (status & RX_STS_PKT_LEN_) >> 16; - if (status & RX_STS_ES_) { - /* Deal with a bad packet */ - dev->stats.rx_errors++; - if (status & RX_STS_CRC_ERR_) - dev->stats.rx_crc_errors++; - else { - if (status & RX_STS_LEN_ERR_) - dev->stats.rx_length_errors++; - if (status & RX_STS_MCAST_) - dev->stats.multicast++; - } - /* Remove the bad packet data from the RX FIFO */ - smc911x_drop_pkt(dev); - } else { - /* Receive a valid packet */ - /* Alloc a buffer with extra room for DMA alignment */ - skb = netdev_alloc_skb(dev, pkt_len+32); - if (unlikely(skb == NULL)) { - PRINTK(dev, "Low memory, rcvd packet dropped.\n"); - dev->stats.rx_dropped++; - smc911x_drop_pkt(dev); - return; - } - /* Align IP header to 32 bits - * Note that the device is configured to add a 2 - * byte padding to the packet start, so we really - * want to write to the orignal data pointer */ - data = skb->data; - skb_reserve(skb, 2); - skb_put(skb,pkt_len-4); -#ifdef SMC_USE_DMA - { - unsigned int fifo; - /* Lower the FIFO threshold if possible */ - fifo = SMC_GET_FIFO_INT(lp); - if (fifo & 0xFF) fifo--; - DBG(SMC_DEBUG_RX, dev, "Setting RX stat FIFO threshold to %d\n", - fifo & 0xff); - SMC_SET_FIFO_INT(lp, fifo); - /* Setup RX DMA */ - SMC_SET_RX_CFG(lp, RX_CFG_RX_END_ALGN16_ | ((2<<8) & RX_CFG_RXDOFF_)); - lp->rxdma_active = 1; - lp->current_rx_skb = skb; - SMC_PULL_DATA(lp, data, (pkt_len+2+15) & ~15); - /* Packet processing deferred to DMA RX interrupt */ - } -#else - SMC_SET_RX_CFG(lp, RX_CFG_RX_END_ALGN4_ | ((2<<8) & RX_CFG_RXDOFF_)); - SMC_PULL_DATA(lp, data, pkt_len+2+3); - - DBG(SMC_DEBUG_PKTS, dev, "Received packet\n"); - PRINT_PKT(data, min(pkt_len - 4, 64U)); - skb->protocol = eth_type_trans(skb, dev); - netif_rx(skb); - dev->stats.rx_packets++; - dev->stats.rx_bytes += pkt_len-4; -#endif - } -} - -/* - * This is called to actually send a packet to the chip. 
- */ -static void smc911x_hardware_send_pkt(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - struct sk_buff *skb; - unsigned int cmdA, cmdB, len; - unsigned char *buf; - - DBG(SMC_DEBUG_FUNC | SMC_DEBUG_TX, dev, "--> %s\n", __func__); - BUG_ON(lp->pending_tx_skb == NULL); - - skb = lp->pending_tx_skb; - lp->pending_tx_skb = NULL; - - /* cmdA {25:24] data alignment [20:16] start offset [10:0] buffer length */ - /* cmdB {31:16] pkt tag [10:0] length */ -#ifdef SMC_USE_DMA - /* 16 byte buffer alignment mode */ - buf = (char*)((u32)(skb->data) & ~0xF); - len = (skb->len + 0xF + ((u32)skb->data & 0xF)) & ~0xF; - cmdA = (1<<24) | (((u32)skb->data & 0xF)<<16) | - TX_CMD_A_INT_FIRST_SEG_ | TX_CMD_A_INT_LAST_SEG_ | - skb->len; -#else - buf = (char *)((uintptr_t)skb->data & ~0x3); - len = (skb->len + 3 + ((uintptr_t)skb->data & 3)) & ~0x3; - cmdA = (((uintptr_t)skb->data & 0x3) << 16) | - TX_CMD_A_INT_FIRST_SEG_ | TX_CMD_A_INT_LAST_SEG_ | - skb->len; -#endif - /* tag is packet length so we can use this in stats update later */ - cmdB = (skb->len << 16) | (skb->len & 0x7FF); - - DBG(SMC_DEBUG_TX, dev, "TX PKT LENGTH 0x%04x (%d) BUF 0x%p CMDA 0x%08x CMDB 0x%08x\n", - len, len, buf, cmdA, cmdB); - SMC_SET_TX_FIFO(lp, cmdA); - SMC_SET_TX_FIFO(lp, cmdB); - - DBG(SMC_DEBUG_PKTS, dev, "Transmitted packet\n"); - PRINT_PKT(buf, min(len, 64U)); - - /* Send pkt via PIO or DMA */ -#ifdef SMC_USE_DMA - lp->current_tx_skb = skb; - SMC_PUSH_DATA(lp, buf, len); - /* DMA complete IRQ will free buffer and set jiffies */ -#else - SMC_PUSH_DATA(lp, buf, len); - netif_trans_update(dev); - dev_kfree_skb_irq(skb); -#endif - if (!lp->tx_throttle) { - netif_wake_queue(dev); - } - SMC_ENABLE_INT(lp, INT_EN_TDFA_EN_ | INT_EN_TSFL_EN_); -} - -/* - * Since I am not sure if I will have enough room in the chip's ram - * to store the packet, I call this routine which either sends it - * now, or set the card to generates an interrupt when ready - * for the packet. 
- */ -static netdev_tx_t -smc911x_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int free; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC | SMC_DEBUG_TX, dev, "--> %s\n", - __func__); - - spin_lock_irqsave(&lp->lock, flags); - - BUG_ON(lp->pending_tx_skb != NULL); - - free = SMC_GET_TX_FIFO_INF(lp) & TX_FIFO_INF_TDFREE_; - DBG(SMC_DEBUG_TX, dev, "TX free space %d\n", free); - - /* Turn off the flow when running out of space in FIFO */ - if (free <= SMC911X_TX_FIFO_LOW_THRESHOLD) { - DBG(SMC_DEBUG_TX, dev, "Disabling data flow due to low FIFO space (%d)\n", - free); - /* Reenable when at least 1 packet of size MTU present */ - SMC_SET_FIFO_TDA(lp, (SMC911X_TX_FIFO_LOW_THRESHOLD)/64); - lp->tx_throttle = 1; - netif_stop_queue(dev); - } - - /* Drop packets when we run out of space in TX FIFO - * Account for overhead required for: - * - * Tx command words 8 bytes - * Start offset 15 bytes - * End padding 15 bytes - */ - if (unlikely(free < (skb->len + 8 + 15 + 15))) { - netdev_warn(dev, "No Tx free space %d < %d\n", - free, skb->len); - lp->pending_tx_skb = NULL; - dev->stats.tx_errors++; - dev->stats.tx_dropped++; - spin_unlock_irqrestore(&lp->lock, flags); - dev_kfree_skb_any(skb); - return NETDEV_TX_OK; - } - -#ifdef SMC_USE_DMA - { - /* If the DMA is already running then defer this packet Tx until - * the DMA IRQ starts it - */ - if (lp->txdma_active) { - DBG(SMC_DEBUG_TX | SMC_DEBUG_DMA, dev, "Tx DMA running, deferring packet\n"); - lp->pending_tx_skb = skb; - netif_stop_queue(dev); - spin_unlock_irqrestore(&lp->lock, flags); - return NETDEV_TX_OK; - } else { - DBG(SMC_DEBUG_TX | SMC_DEBUG_DMA, dev, "Activating Tx DMA\n"); - lp->txdma_active = 1; - } - } -#endif - lp->pending_tx_skb = skb; - smc911x_hardware_send_pkt(dev); - spin_unlock_irqrestore(&lp->lock, flags); - - return NETDEV_TX_OK; -} - -/* - * This handles a TX status interrupt, which is only called when: - * - a TX error occurred, or - * - TX of a packet completed. 
- */ -static void smc911x_tx(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int tx_status; - - DBG(SMC_DEBUG_FUNC | SMC_DEBUG_TX, dev, "--> %s\n", - __func__); - - /* Collect the TX status */ - while (((SMC_GET_TX_FIFO_INF(lp) & TX_FIFO_INF_TSUSED_) >> 16) != 0) { - DBG(SMC_DEBUG_TX, dev, "Tx stat FIFO used 0x%04x\n", - (SMC_GET_TX_FIFO_INF(lp) & TX_FIFO_INF_TSUSED_) >> 16); - tx_status = SMC_GET_TX_STS_FIFO(lp); - dev->stats.tx_packets++; - dev->stats.tx_bytes+=tx_status>>16; - DBG(SMC_DEBUG_TX, dev, "Tx FIFO tag 0x%04x status 0x%04x\n", - (tx_status & 0xffff0000) >> 16, - tx_status & 0x0000ffff); - /* count Tx errors, but ignore lost carrier errors when in - * full-duplex mode */ - if ((tx_status & TX_STS_ES_) && !(lp->ctl_rfduplx && - !(tx_status & 0x00000306))) { - dev->stats.tx_errors++; - } - if (tx_status & TX_STS_MANY_COLL_) { - dev->stats.collisions+=16; - dev->stats.tx_aborted_errors++; - } else { - dev->stats.collisions+=(tx_status & TX_STS_COLL_CNT_) >> 3; - } - /* carrier error only has meaning for half-duplex communication */ - if ((tx_status & (TX_STS_LOC_ | TX_STS_NO_CARR_)) && - !lp->ctl_rfduplx) { - dev->stats.tx_carrier_errors++; - } - if (tx_status & TX_STS_LATE_COLL_) { - dev->stats.collisions++; - dev->stats.tx_aborted_errors++; - } - } -} - - -/*---PHY CONTROL AND CONFIGURATION-----------------------------------------*/ -/* - * Reads a register from the MII Management serial interface - */ - -static int smc911x_phy_read(struct net_device *dev, int phyaddr, int phyreg) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int phydata; - - SMC_GET_MII(lp, phyreg, phyaddr, phydata); - - DBG(SMC_DEBUG_MISC, dev, "%s: phyaddr=0x%x, phyreg=0x%02x, phydata=0x%04x\n", - __func__, phyaddr, phyreg, phydata); - return phydata; -} - - -/* - * Writes a register to the MII Management serial interface - */ -static void smc911x_phy_write(struct net_device *dev, int phyaddr, int phyreg, - int phydata) -{ - struct smc911x_local *lp = netdev_priv(dev); - - DBG(SMC_DEBUG_MISC, dev, "%s: phyaddr=0x%x, phyreg=0x%x, phydata=0x%x\n", - __func__, phyaddr, phyreg, phydata); - - SMC_SET_MII(lp, phyreg, phyaddr, phydata); -} - -/* - * Finds and reports the PHY address (115 and 117 have external - * PHY interface 118 has internal only - */ -static void smc911x_phy_detect(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - int phyaddr; - unsigned int cfg, id1, id2; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - lp->phy_type = 0; - - /* - * Scan all 32 PHY addresses if necessary, starting at - * PHY#1 to PHY#31, and then PHY#0 last. 
- */ - switch(lp->version) { - case CHIP_9115: - case CHIP_9117: - case CHIP_9215: - case CHIP_9217: - cfg = SMC_GET_HW_CFG(lp); - if (cfg & HW_CFG_EXT_PHY_DET_) { - cfg &= ~HW_CFG_PHY_CLK_SEL_; - cfg |= HW_CFG_PHY_CLK_SEL_CLK_DIS_; - SMC_SET_HW_CFG(lp, cfg); - udelay(10); /* Wait for clocks to stop */ - - cfg |= HW_CFG_EXT_PHY_EN_; - SMC_SET_HW_CFG(lp, cfg); - udelay(10); /* Wait for clocks to stop */ - - cfg &= ~HW_CFG_PHY_CLK_SEL_; - cfg |= HW_CFG_PHY_CLK_SEL_EXT_PHY_; - SMC_SET_HW_CFG(lp, cfg); - udelay(10); /* Wait for clocks to stop */ - - cfg |= HW_CFG_SMI_SEL_; - SMC_SET_HW_CFG(lp, cfg); - - for (phyaddr = 1; phyaddr < 32; ++phyaddr) { - - /* Read the PHY identifiers */ - SMC_GET_PHY_ID1(lp, phyaddr & 31, id1); - SMC_GET_PHY_ID2(lp, phyaddr & 31, id2); - - /* Make sure it is a valid identifier */ - if (id1 != 0x0000 && id1 != 0xffff && - id1 != 0x8000 && id2 != 0x0000 && - id2 != 0xffff && id2 != 0x8000) { - /* Save the PHY's address */ - lp->mii.phy_id = phyaddr & 31; - lp->phy_type = id1 << 16 | id2; - break; - } - } - if (phyaddr < 32) - /* Found an external PHY */ - break; - } - fallthrough; - default: - /* Internal media only */ - SMC_GET_PHY_ID1(lp, 1, id1); - SMC_GET_PHY_ID2(lp, 1, id2); - /* Save the PHY's address */ - lp->mii.phy_id = 1; - lp->phy_type = id1 << 16 | id2; - } - - DBG(SMC_DEBUG_MISC, dev, "phy_id1=0x%x, phy_id2=0x%x phyaddr=0x%x\n", - id1, id2, lp->mii.phy_id); -} - -/* - * Sets the PHY to a configuration as determined by the user. - * Called with spin_lock held. - */ -static int smc911x_phy_fixed(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - int phyaddr = lp->mii.phy_id; - int bmcr; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - /* Enter Link Disable state */ - SMC_GET_PHY_BMCR(lp, phyaddr, bmcr); - bmcr |= BMCR_PDOWN; - SMC_SET_PHY_BMCR(lp, phyaddr, bmcr); - - /* - * Set our fixed capabilities - * Disable auto-negotiation - */ - bmcr &= ~BMCR_ANENABLE; - if (lp->ctl_rfduplx) - bmcr |= BMCR_FULLDPLX; - - if (lp->ctl_rspeed == 100) - bmcr |= BMCR_SPEED100; - - /* Write our capabilities to the phy control register */ - SMC_SET_PHY_BMCR(lp, phyaddr, bmcr); - - /* Re-Configure the Receive/Phy Control register */ - bmcr &= ~BMCR_PDOWN; - SMC_SET_PHY_BMCR(lp, phyaddr, bmcr); - - return 1; -} - -/** - * smc911x_phy_reset - reset the phy - * @dev: net device - * @phy: phy address - * - * Issue a software reset for the specified PHY and - * wait up to 100ms for the reset to complete. We should - * not access the PHY for 50ms after issuing the reset. - * - * The time to wait appears to be dependent on the PHY. - * - */ -static int smc911x_phy_reset(struct net_device *dev, int phy) -{ - struct smc911x_local *lp = netdev_priv(dev); - int timeout; - unsigned long flags; - unsigned int reg; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s()\n", __func__); - - spin_lock_irqsave(&lp->lock, flags); - reg = SMC_GET_PMT_CTRL(lp); - reg &= ~0xfffff030; - reg |= PMT_CTRL_PHY_RST_; - SMC_SET_PMT_CTRL(lp, reg); - spin_unlock_irqrestore(&lp->lock, flags); - for (timeout = 2; timeout; timeout--) { - msleep(50); - spin_lock_irqsave(&lp->lock, flags); - reg = SMC_GET_PMT_CTRL(lp); - spin_unlock_irqrestore(&lp->lock, flags); - if (!(reg & PMT_CTRL_PHY_RST_)) { - /* extra delay required because the phy may - * not be completed with its reset - * when PHY_BCR_RESET_ is cleared. 
256us - * should suffice, but use 500us to be safe - */ - udelay(500); - break; - } - } - - return reg & PMT_CTRL_PHY_RST_; -} - -/** - * smc911x_phy_powerdown - powerdown phy - * @dev: net device - * @phy: phy address - * - * Power down the specified PHY - */ -static void smc911x_phy_powerdown(struct net_device *dev, int phy) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int bmcr; - - /* Enter Link Disable state */ - SMC_GET_PHY_BMCR(lp, phy, bmcr); - bmcr |= BMCR_PDOWN; - SMC_SET_PHY_BMCR(lp, phy, bmcr); -} - -/** - * smc911x_phy_check_media - check the media status and adjust BMCR - * @dev: net device - * @init: set true for initialisation - * - * Select duplex mode depending on negotiation state. This - * also updates our carrier state. - */ -static void smc911x_phy_check_media(struct net_device *dev, int init) -{ - struct smc911x_local *lp = netdev_priv(dev); - int phyaddr = lp->mii.phy_id; - unsigned int bmcr, cr; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - if (mii_check_media(&lp->mii, netif_msg_link(lp), init)) { - /* duplex state has changed */ - SMC_GET_PHY_BMCR(lp, phyaddr, bmcr); - SMC_GET_MAC_CR(lp, cr); - if (lp->mii.full_duplex) { - DBG(SMC_DEBUG_MISC, dev, "Configuring for full-duplex mode\n"); - bmcr |= BMCR_FULLDPLX; - cr |= MAC_CR_RCVOWN_; - } else { - DBG(SMC_DEBUG_MISC, dev, "Configuring for half-duplex mode\n"); - bmcr &= ~BMCR_FULLDPLX; - cr &= ~MAC_CR_RCVOWN_; - } - SMC_SET_PHY_BMCR(lp, phyaddr, bmcr); - SMC_SET_MAC_CR(lp, cr); - } -} - -/* - * Configures the specified PHY through the MII management interface - * using Autonegotiation. - * Calls smc911x_phy_fixed() if the user has requested a certain config. - * If RPC ANEG bit is set, the media selection is dependent purely on - * the selection by the MII (either in the MII BMCR reg or the result - * of autonegotiation.) If the RPC ANEG bit is cleared, the selection - * is controlled by the RPC SPEED and RPC DPLX bits. - */ -static void smc911x_phy_configure(struct work_struct *work) -{ - struct smc911x_local *lp = container_of(work, struct smc911x_local, - phy_configure); - struct net_device *dev = lp->netdev; - int phyaddr = lp->mii.phy_id; - int my_phy_caps; /* My PHY capabilities */ - int my_ad_caps; /* My Advertised capabilities */ - int status __always_unused; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s()\n", __func__); - - /* - * We should not be called if phy_type is zero. 
- */ - if (lp->phy_type == 0) - return; - - if (smc911x_phy_reset(dev, phyaddr)) { - netdev_info(dev, "PHY reset timed out\n"); - return; - } - spin_lock_irqsave(&lp->lock, flags); - - /* - * Enable PHY Interrupts (for register 18) - * Interrupts listed here are enabled - */ - SMC_SET_PHY_INT_MASK(lp, phyaddr, PHY_INT_MASK_ENERGY_ON_ | - PHY_INT_MASK_ANEG_COMP_ | PHY_INT_MASK_REMOTE_FAULT_ | - PHY_INT_MASK_LINK_DOWN_); - - /* If the user requested no auto neg, then go set his request */ - if (lp->mii.force_media) { - smc911x_phy_fixed(dev); - goto smc911x_phy_configure_exit; - } - - /* Copy our capabilities from MII_BMSR to MII_ADVERTISE */ - SMC_GET_PHY_BMSR(lp, phyaddr, my_phy_caps); - if (!(my_phy_caps & BMSR_ANEGCAPABLE)) { - netdev_info(dev, "Auto negotiation NOT supported\n"); - smc911x_phy_fixed(dev); - goto smc911x_phy_configure_exit; - } - - /* CSMA capable w/ both pauses */ - my_ad_caps = ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; - - if (my_phy_caps & BMSR_100BASE4) - my_ad_caps |= ADVERTISE_100BASE4; - if (my_phy_caps & BMSR_100FULL) - my_ad_caps |= ADVERTISE_100FULL; - if (my_phy_caps & BMSR_100HALF) - my_ad_caps |= ADVERTISE_100HALF; - if (my_phy_caps & BMSR_10FULL) - my_ad_caps |= ADVERTISE_10FULL; - if (my_phy_caps & BMSR_10HALF) - my_ad_caps |= ADVERTISE_10HALF; - - /* Disable capabilities not selected by our user */ - if (lp->ctl_rspeed != 100) - my_ad_caps &= ~(ADVERTISE_100BASE4|ADVERTISE_100FULL|ADVERTISE_100HALF); - - if (!lp->ctl_rfduplx) - my_ad_caps &= ~(ADVERTISE_100FULL|ADVERTISE_10FULL); - - /* Update our Auto-Neg Advertisement Register */ - SMC_SET_PHY_MII_ADV(lp, phyaddr, my_ad_caps); - lp->mii.advertising = my_ad_caps; - - /* - * Read the register back. Without this, it appears that when - * auto-negotiation is restarted, sometimes it isn't ready and - * the link does not come up. - */ - udelay(10); - SMC_GET_PHY_MII_ADV(lp, phyaddr, status); - - DBG(SMC_DEBUG_MISC, dev, "phy caps=0x%04x\n", my_phy_caps); - DBG(SMC_DEBUG_MISC, dev, "phy advertised caps=0x%04x\n", my_ad_caps); - - /* Restart auto-negotiation process in order to advertise my caps */ - SMC_SET_PHY_BMCR(lp, phyaddr, BMCR_ANENABLE | BMCR_ANRESTART); - - smc911x_phy_check_media(dev, 1); - -smc911x_phy_configure_exit: - spin_unlock_irqrestore(&lp->lock, flags); -} - -/* - * smc911x_phy_interrupt - * - * Purpose: Handle interrupts relating to PHY register 18. This is - * called from the "hard" interrupt handler under our private spinlock. - */ -static void smc911x_phy_interrupt(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - int phyaddr = lp->mii.phy_id; - int status __always_unused; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - if (lp->phy_type == 0) - return; - - smc911x_phy_check_media(dev, 0); - /* read to clear status bits */ - SMC_GET_PHY_INT_SRC(lp, phyaddr,status); - DBG(SMC_DEBUG_MISC, dev, "PHY interrupt status 0x%04x\n", - status & 0xffff); - DBG(SMC_DEBUG_MISC, dev, "AFC_CFG 0x%08x\n", - SMC_GET_AFC_CFG(lp)); -} - -/*--- END PHY CONTROL AND CONFIGURATION-------------------------------------*/ - -/* - * This is the main routine of the driver, to handle the device when - * it needs some attention. 
- */ -static irqreturn_t smc911x_interrupt(int irq, void *dev_id) -{ - struct net_device *dev = dev_id; - struct smc911x_local *lp = netdev_priv(dev); - unsigned int status, mask, timeout; - unsigned int rx_overrun=0, cr, pkts; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - spin_lock_irqsave(&lp->lock, flags); - - /* Spurious interrupt check */ - if ((SMC_GET_IRQ_CFG(lp) & (INT_CFG_IRQ_INT_ | INT_CFG_IRQ_EN_)) != - (INT_CFG_IRQ_INT_ | INT_CFG_IRQ_EN_)) { - spin_unlock_irqrestore(&lp->lock, flags); - return IRQ_NONE; - } - - mask = SMC_GET_INT_EN(lp); - SMC_SET_INT_EN(lp, 0); - - /* set a timeout value, so I don't stay here forever */ - timeout = 8; - - - do { - status = SMC_GET_INT(lp); - - DBG(SMC_DEBUG_MISC, dev, "INT 0x%08x MASK 0x%08x OUTSIDE MASK 0x%08x\n", - status, mask, status & ~mask); - - status &= mask; - if (!status) - break; - - /* Handle SW interrupt condition */ - if (status & INT_STS_SW_INT_) { - SMC_ACK_INT(lp, INT_STS_SW_INT_); - mask &= ~INT_EN_SW_INT_EN_; - } - /* Handle various error conditions */ - if (status & INT_STS_RXE_) { - SMC_ACK_INT(lp, INT_STS_RXE_); - dev->stats.rx_errors++; - } - if (status & INT_STS_RXDFH_INT_) { - SMC_ACK_INT(lp, INT_STS_RXDFH_INT_); - dev->stats.rx_dropped+=SMC_GET_RX_DROP(lp); - } - /* Undocumented interrupt-what is the right thing to do here? */ - if (status & INT_STS_RXDF_INT_) { - SMC_ACK_INT(lp, INT_STS_RXDF_INT_); - } - - /* Rx Data FIFO exceeds set level */ - if (status & INT_STS_RDFL_) { - if (IS_REV_A(lp->revision)) { - rx_overrun=1; - SMC_GET_MAC_CR(lp, cr); - cr &= ~MAC_CR_RXEN_; - SMC_SET_MAC_CR(lp, cr); - DBG(SMC_DEBUG_RX, dev, "RX overrun\n"); - dev->stats.rx_errors++; - dev->stats.rx_fifo_errors++; - } - SMC_ACK_INT(lp, INT_STS_RDFL_); - } - if (status & INT_STS_RDFO_) { - if (!IS_REV_A(lp->revision)) { - SMC_GET_MAC_CR(lp, cr); - cr &= ~MAC_CR_RXEN_; - SMC_SET_MAC_CR(lp, cr); - rx_overrun=1; - DBG(SMC_DEBUG_RX, dev, "RX overrun\n"); - dev->stats.rx_errors++; - dev->stats.rx_fifo_errors++; - } - SMC_ACK_INT(lp, INT_STS_RDFO_); - } - /* Handle receive condition */ - if ((status & INT_STS_RSFL_) || rx_overrun) { - unsigned int fifo; - DBG(SMC_DEBUG_RX, dev, "RX irq\n"); - fifo = SMC_GET_RX_FIFO_INF(lp); - pkts = (fifo & RX_FIFO_INF_RXSUSED_) >> 16; - DBG(SMC_DEBUG_RX, dev, "Rx FIFO pkts %d, bytes %d\n", - pkts, fifo & 0xFFFF); - if (pkts != 0) { -#ifdef SMC_USE_DMA - unsigned int fifo; - if (lp->rxdma_active){ - DBG(SMC_DEBUG_RX | SMC_DEBUG_DMA, dev, - "RX DMA active\n"); - /* The DMA is already running so up the IRQ threshold */ - fifo = SMC_GET_FIFO_INT(lp) & ~0xFF; - fifo |= pkts & 0xFF; - DBG(SMC_DEBUG_RX, dev, - "Setting RX stat FIFO threshold to %d\n", - fifo & 0xff); - SMC_SET_FIFO_INT(lp, fifo); - } else -#endif - smc911x_rcv(dev); - } - SMC_ACK_INT(lp, INT_STS_RSFL_); - } - /* Handle transmit FIFO available */ - if (status & INT_STS_TDFA_) { - DBG(SMC_DEBUG_TX, dev, "TX data FIFO space available irq\n"); - SMC_SET_FIFO_TDA(lp, 0xFF); - lp->tx_throttle = 0; -#ifdef SMC_USE_DMA - if (!lp->txdma_active) -#endif - netif_wake_queue(dev); - SMC_ACK_INT(lp, INT_STS_TDFA_); - } - /* Handle transmit done condition */ -#if 1 - if (status & (INT_STS_TSFL_ | INT_STS_GPT_INT_)) { - DBG(SMC_DEBUG_TX | SMC_DEBUG_MISC, dev, - "Tx stat FIFO limit (%d) /GPT irq\n", - (SMC_GET_FIFO_INT(lp) & 0x00ff0000) >> 16); - smc911x_tx(dev); - SMC_SET_GPT_CFG(lp, GPT_CFG_TIMER_EN_ | 10000); - SMC_ACK_INT(lp, INT_STS_TSFL_); - SMC_ACK_INT(lp, INT_STS_TSFL_ | INT_STS_GPT_INT_); - } -#else - if (status & 
INT_STS_TSFL_) { - DBG(SMC_DEBUG_TX, dev, "TX status FIFO limit (%d) irq\n", ?); - smc911x_tx(dev); - SMC_ACK_INT(lp, INT_STS_TSFL_); - } - - if (status & INT_STS_GPT_INT_) { - DBG(SMC_DEBUG_RX, dev, "IRQ_CFG 0x%08x FIFO_INT 0x%08x RX_CFG 0x%08x\n", - SMC_GET_IRQ_CFG(lp), - SMC_GET_FIFO_INT(lp), - SMC_GET_RX_CFG(lp)); - DBG(SMC_DEBUG_RX, dev, "Rx Stat FIFO Used 0x%02x Data FIFO Used 0x%04x Stat FIFO 0x%08x\n", - (SMC_GET_RX_FIFO_INF(lp) & 0x00ff0000) >> 16, - SMC_GET_RX_FIFO_INF(lp) & 0xffff, - SMC_GET_RX_STS_FIFO_PEEK(lp)); - SMC_SET_GPT_CFG(lp, GPT_CFG_TIMER_EN_ | 10000); - SMC_ACK_INT(lp, INT_STS_GPT_INT_); - } -#endif - - /* Handle PHY interrupt condition */ - if (status & INT_STS_PHY_INT_) { - DBG(SMC_DEBUG_MISC, dev, "PHY irq\n"); - smc911x_phy_interrupt(dev); - SMC_ACK_INT(lp, INT_STS_PHY_INT_); - } - } while (--timeout); - - /* restore mask state */ - SMC_SET_INT_EN(lp, mask); - - DBG(SMC_DEBUG_MISC, dev, "Interrupt done (%d loops)\n", - 8-timeout); - - spin_unlock_irqrestore(&lp->lock, flags); - - return IRQ_HANDLED; -} - -#ifdef SMC_USE_DMA -static void -smc911x_tx_dma_irq(void *data) -{ - struct smc911x_local *lp = data; - struct net_device *dev = lp->netdev; - struct sk_buff *skb = lp->current_tx_skb; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - DBG(SMC_DEBUG_TX | SMC_DEBUG_DMA, dev, "TX DMA irq handler\n"); - BUG_ON(skb == NULL); - dma_unmap_single(lp->dev, tx_dmabuf, tx_dmalen, DMA_TO_DEVICE); - netif_trans_update(dev); - dev_kfree_skb_irq(skb); - lp->current_tx_skb = NULL; - if (lp->pending_tx_skb != NULL) - smc911x_hardware_send_pkt(dev); - else { - DBG(SMC_DEBUG_TX | SMC_DEBUG_DMA, dev, - "No pending Tx packets. DMA disabled\n"); - spin_lock_irqsave(&lp->lock, flags); - lp->txdma_active = 0; - if (!lp->tx_throttle) { - netif_wake_queue(dev); - } - spin_unlock_irqrestore(&lp->lock, flags); - } - - DBG(SMC_DEBUG_TX | SMC_DEBUG_DMA, dev, - "TX DMA irq completed\n"); -} -static void -smc911x_rx_dma_irq(void *data) -{ - struct smc911x_local *lp = data; - struct net_device *dev = lp->netdev; - struct sk_buff *skb = lp->current_rx_skb; - unsigned long flags; - unsigned int pkts; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - DBG(SMC_DEBUG_RX | SMC_DEBUG_DMA, dev, "RX DMA irq handler\n"); - dma_unmap_single(lp->dev, rx_dmabuf, rx_dmalen, DMA_FROM_DEVICE); - BUG_ON(skb == NULL); - lp->current_rx_skb = NULL; - PRINT_PKT(skb->data, skb->len); - skb->protocol = eth_type_trans(skb, dev); - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; - netif_rx(skb); - - spin_lock_irqsave(&lp->lock, flags); - pkts = (SMC_GET_RX_FIFO_INF(lp) & RX_FIFO_INF_RXSUSED_) >> 16; - if (pkts != 0) { - smc911x_rcv(dev); - }else { - lp->rxdma_active = 0; - } - spin_unlock_irqrestore(&lp->lock, flags); - DBG(SMC_DEBUG_RX | SMC_DEBUG_DMA, dev, - "RX DMA irq completed. DMA RX FIFO PKTS %d\n", - pkts); -} -#endif /* SMC_USE_DMA */ - -#ifdef CONFIG_NET_POLL_CONTROLLER -/* - * Polling receive - used by netconsole and other diagnostic tools - * to allow network i/o with interrupts disabled. - */ -static void smc911x_poll_controller(struct net_device *dev) -{ - disable_irq(dev->irq); - smc911x_interrupt(dev->irq, dev); - enable_irq(dev->irq); -} -#endif - -/* Our watchdog timed out. 
Called by the networking layer */ -static void smc911x_timeout(struct net_device *dev, unsigned int txqueue) -{ - struct smc911x_local *lp = netdev_priv(dev); - int status, mask; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - spin_lock_irqsave(&lp->lock, flags); - status = SMC_GET_INT(lp); - mask = SMC_GET_INT_EN(lp); - spin_unlock_irqrestore(&lp->lock, flags); - DBG(SMC_DEBUG_MISC, dev, "INT 0x%02x MASK 0x%02x\n", - status, mask); - - /* Dump the current TX FIFO contents and restart */ - mask = SMC_GET_TX_CFG(lp); - SMC_SET_TX_CFG(lp, mask | TX_CFG_TXS_DUMP_ | TX_CFG_TXD_DUMP_); - /* - * Reconfiguring the PHY doesn't seem like a bad idea here, but - * smc911x_phy_configure() calls msleep() which calls schedule_timeout() - * which calls schedule(). Hence we use a work queue. - */ - if (lp->phy_type != 0) - schedule_work(&lp->phy_configure); - - /* We can accept TX packets again */ - netif_trans_update(dev); /* prevent tx timeout */ - netif_wake_queue(dev); -} - -/* - * This routine will, depending on the values passed to it, - * either make it accept multicast packets, go into - * promiscuous mode (for TCPDUMP and cousins) or accept - * a select set of multicast packets - */ -static void smc911x_set_multicast_list(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int multicast_table[2]; - unsigned int mcr, update_multicast = 0; - unsigned long flags; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - spin_lock_irqsave(&lp->lock, flags); - SMC_GET_MAC_CR(lp, mcr); - spin_unlock_irqrestore(&lp->lock, flags); - - if (dev->flags & IFF_PROMISC) { - - DBG(SMC_DEBUG_MISC, dev, "RCR_PRMS\n"); - mcr |= MAC_CR_PRMS_; - } - /* - * Here, I am setting this to accept all multicast packets. - * I don't need to zero the multicast table, because the flag is - * checked before the table is - */ - else if (dev->flags & IFF_ALLMULTI || netdev_mc_count(dev) > 16) { - DBG(SMC_DEBUG_MISC, dev, "RCR_ALMUL\n"); - mcr |= MAC_CR_MCPAS_; - } - - /* - * This sets the internal hardware table to filter out unwanted - * multicast packets before they take up memory. - * - * The SMC chip uses a hash table where the high 6 bits of the CRC of - * address are the offset into the table. If that bit is 1, then the - * multicast packet is accepted. Otherwise, it's dropped silently. - * - * To use the 6 bits as an offset into the table, the high 1 bit is - * the number of the 32 bit register, while the low 5 bits are the bit - * within that register. 
- */ - else if (!netdev_mc_empty(dev)) { - struct netdev_hw_addr *ha; - - /* Set the Hash perfec mode */ - mcr |= MAC_CR_HPFILT_; - - /* start with a table of all zeros: reject all */ - memset(multicast_table, 0, sizeof(multicast_table)); - - netdev_for_each_mc_addr(ha, dev) { - u32 position; - - /* upper 6 bits are used as hash index */ - position = ether_crc(ETH_ALEN, ha->addr)>>26; - - multicast_table[position>>5] |= 1 << (position&0x1f); - } - - /* be sure I get rid of flags I might have set */ - mcr &= ~(MAC_CR_PRMS_ | MAC_CR_MCPAS_); - - /* now, the table can be loaded into the chipset */ - update_multicast = 1; - } else { - DBG(SMC_DEBUG_MISC, dev, "~(MAC_CR_PRMS_|MAC_CR_MCPAS_)\n"); - mcr &= ~(MAC_CR_PRMS_ | MAC_CR_MCPAS_); - - /* - * since I'm disabling all multicast entirely, I need to - * clear the multicast list - */ - memset(multicast_table, 0, sizeof(multicast_table)); - update_multicast = 1; - } - - spin_lock_irqsave(&lp->lock, flags); - SMC_SET_MAC_CR(lp, mcr); - if (update_multicast) { - DBG(SMC_DEBUG_MISC, dev, - "update mcast hash table 0x%08x 0x%08x\n", - multicast_table[0], multicast_table[1]); - SMC_SET_HASHL(lp, multicast_table[0]); - SMC_SET_HASHH(lp, multicast_table[1]); - } - spin_unlock_irqrestore(&lp->lock, flags); -} - - -/* - * Open and Initialize the board - * - * Set up everything, reset the card, etc.. - */ -static int -smc911x_open(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - /* reset the hardware */ - smc911x_reset(dev); - - /* Configure the PHY, initialize the link state */ - smc911x_phy_configure(&lp->phy_configure); - - /* Turn on Tx + Rx */ - smc911x_enable(dev); - - netif_start_queue(dev); - - return 0; -} - -/* - * smc911x_close - * - * this makes the board clean up everything that it can - * and not talk to the outside world. Caused by - * an 'ifconfig ethX down' - */ -static int smc911x_close(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - netif_stop_queue(dev); - netif_carrier_off(dev); - - /* clear everything */ - smc911x_shutdown(dev); - - if (lp->phy_type != 0) { - /* We need to ensure that no calls to - * smc911x_phy_configure are pending. - */ - cancel_work_sync(&lp->phy_configure); - smc911x_phy_powerdown(dev, lp->mii.phy_id); - } - - if (lp->pending_tx_skb) { - dev_kfree_skb(lp->pending_tx_skb); - lp->pending_tx_skb = NULL; - } - - return 0; -} - -/* - * Ethtool support - */ -static int -smc911x_ethtool_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *cmd) -{ - struct smc911x_local *lp = netdev_priv(dev); - int status; - unsigned long flags; - u32 supported; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - if (lp->phy_type != 0) { - spin_lock_irqsave(&lp->lock, flags); - mii_ethtool_get_link_ksettings(&lp->mii, cmd); - spin_unlock_irqrestore(&lp->lock, flags); - } else { - supported = SUPPORTED_10baseT_Half | - SUPPORTED_10baseT_Full | - SUPPORTED_TP | SUPPORTED_AUI; - - if (lp->ctl_rspeed == 10) - cmd->base.speed = SPEED_10; - else if (lp->ctl_rspeed == 100) - cmd->base.speed = SPEED_100; - - cmd->base.autoneg = AUTONEG_DISABLE; - cmd->base.port = 0; - SMC_GET_PHY_SPECIAL(lp, lp->mii.phy_id, status); - cmd->base.duplex = - (status & (PHY_SPECIAL_SPD_10FULL_ | PHY_SPECIAL_SPD_100FULL_)) ? 
- DUPLEX_FULL : DUPLEX_HALF; - - ethtool_convert_legacy_u32_to_link_mode( - cmd->link_modes.supported, supported); - - } - - return 0; -} - -static int -smc911x_ethtool_set_link_ksettings(struct net_device *dev, - const struct ethtool_link_ksettings *cmd) -{ - struct smc911x_local *lp = netdev_priv(dev); - int ret; - unsigned long flags; - - if (lp->phy_type != 0) { - spin_lock_irqsave(&lp->lock, flags); - ret = mii_ethtool_set_link_ksettings(&lp->mii, cmd); - spin_unlock_irqrestore(&lp->lock, flags); - } else { - if (cmd->base.autoneg != AUTONEG_DISABLE || - cmd->base.speed != SPEED_10 || - (cmd->base.duplex != DUPLEX_HALF && - cmd->base.duplex != DUPLEX_FULL) || - (cmd->base.port != PORT_TP && - cmd->base.port != PORT_AUI)) - return -EINVAL; - - lp->ctl_rfduplx = cmd->base.duplex == DUPLEX_FULL; - - ret = 0; - } - - return ret; -} - -static void -smc911x_ethtool_getdrvinfo(struct net_device *dev, struct ethtool_drvinfo *info) -{ - strscpy(info->driver, CARDNAME, sizeof(info->driver)); - strscpy(info->version, version, sizeof(info->version)); - strscpy(info->bus_info, dev_name(dev->dev.parent), - sizeof(info->bus_info)); -} - -static int smc911x_ethtool_nwayreset(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - int ret = -EINVAL; - unsigned long flags; - - if (lp->phy_type != 0) { - spin_lock_irqsave(&lp->lock, flags); - ret = mii_nway_restart(&lp->mii); - spin_unlock_irqrestore(&lp->lock, flags); - } - - return ret; -} - -static u32 smc911x_ethtool_getmsglevel(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - return lp->msg_enable; -} - -static void smc911x_ethtool_setmsglevel(struct net_device *dev, u32 level) -{ - struct smc911x_local *lp = netdev_priv(dev); - lp->msg_enable = level; -} - -static int smc911x_ethtool_getregslen(struct net_device *dev) -{ - /* System regs + MAC regs + PHY regs */ - return (((E2P_CMD - ID_REV)/4 + 1) + - (WUCSR - MAC_CR)+1 + 32) * sizeof(u32); -} - -static void smc911x_ethtool_getregs(struct net_device *dev, - struct ethtool_regs *regs, void *buf) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned long flags; - u32 reg,i,j=0; - u32 *data = (u32*)buf; - - regs->version = lp->version; - for(i=ID_REV;i<=E2P_CMD;i+=4) { - data[j++] = SMC_inl(lp, i); - } - for(i=MAC_CR;i<=WUCSR;i++) { - spin_lock_irqsave(&lp->lock, flags); - SMC_GET_MAC_CSR(lp, i, reg); - spin_unlock_irqrestore(&lp->lock, flags); - data[j++] = reg; - } - for(i=0;i<=31;i++) { - spin_lock_irqsave(&lp->lock, flags); - SMC_GET_MII(lp, i, lp->mii.phy_id, reg); - spin_unlock_irqrestore(&lp->lock, flags); - data[j++] = reg & 0xFFFF; - } -} - -static int smc911x_ethtool_wait_eeprom_ready(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - unsigned int timeout; - int e2p_cmd; - - e2p_cmd = SMC_GET_E2P_CMD(lp); - for(timeout=10;(e2p_cmd & E2P_CMD_EPC_BUSY_) && timeout; timeout--) { - if (e2p_cmd & E2P_CMD_EPC_TIMEOUT_) { - PRINTK(dev, "%s timeout waiting for EEPROM to respond\n", - __func__); - return -EFAULT; - } - mdelay(1); - e2p_cmd = SMC_GET_E2P_CMD(lp); - } - if (timeout == 0) { - PRINTK(dev, "%s timeout waiting for EEPROM CMD not busy\n", - __func__); - return -ETIMEDOUT; - } - return 0; -} - -static inline int smc911x_ethtool_write_eeprom_cmd(struct net_device *dev, - int cmd, int addr) -{ - struct smc911x_local *lp = netdev_priv(dev); - int ret; - - if ((ret = smc911x_ethtool_wait_eeprom_ready(dev))!=0) - return ret; - SMC_SET_E2P_CMD(lp, E2P_CMD_EPC_BUSY_ | - ((cmd) & (0x7<<28)) | - ((addr) & 0xFF)); - 
	return 0;
-}
-
-static inline int smc911x_ethtool_read_eeprom_byte(struct net_device *dev,
-					u8 *data)
-{
-	struct smc911x_local *lp = netdev_priv(dev);
-	int ret;
-
-	if ((ret = smc911x_ethtool_wait_eeprom_ready(dev))!=0)
-		return ret;
-	*data = SMC_GET_E2P_DATA(lp);
-	return 0;
-}
-
-static inline int smc911x_ethtool_write_eeprom_byte(struct net_device *dev,
-					u8 data)
-{
-	struct smc911x_local *lp = netdev_priv(dev);
-	int ret;
-
-	if ((ret = smc911x_ethtool_wait_eeprom_ready(dev))!=0)
-		return ret;
-	SMC_SET_E2P_DATA(lp, data);
-	return 0;
-}
-
-static int smc911x_ethtool_geteeprom(struct net_device *dev,
-			struct ethtool_eeprom *eeprom, u8 *data)
-{
-	u8 eebuf[SMC911X_EEPROM_LEN];
-	int i, ret;
-
-	for(i=0;i<SMC911X_EEPROM_LEN;i++) {
-		if ((ret=smc911x_ethtool_write_eeprom_cmd(dev, E2P_CMD_EPC_CMD_READ_, i ))!=0)
-			return ret;
-		if ((ret=smc911x_ethtool_read_eeprom_byte(dev, &eebuf[i]))!=0)
-			return ret;
-	}
-	memcpy(data, eebuf+eeprom->offset, eeprom->len);
-	return 0;
-}
-
-static int smc911x_ethtool_seteeprom(struct net_device *dev,
-		struct ethtool_eeprom *eeprom, u8 *data)
-{
-	int i, ret;
-
-	/* Enable erase */
-	if ((ret=smc911x_ethtool_write_eeprom_cmd(dev, E2P_CMD_EPC_CMD_EWEN_, 0 ))!=0)
-		return ret;
-	for(i=eeprom->offset;i<(eeprom->offset+eeprom->len);i++) {
-		/* erase byte */
-		if ((ret=smc911x_ethtool_write_eeprom_cmd(dev, E2P_CMD_EPC_CMD_ERASE_, i ))!=0)
-			return ret;
-		/* write byte */
-		if ((ret=smc911x_ethtool_write_eeprom_byte(dev, *data))!=0)
-			return ret;
-		if ((ret=smc911x_ethtool_write_eeprom_cmd(dev, E2P_CMD_EPC_CMD_WRITE_, i ))!=0)
-			return ret;
-	}
-	return 0;
-}
-
-static int smc911x_ethtool_geteeprom_len(struct net_device *dev)
-{
-	return SMC911X_EEPROM_LEN;
-}
-
-static const struct ethtool_ops smc911x_ethtool_ops = {
-	.get_drvinfo		= smc911x_ethtool_getdrvinfo,
-	.get_msglevel		= smc911x_ethtool_getmsglevel,
-	.set_msglevel		= smc911x_ethtool_setmsglevel,
-	.nway_reset		= smc911x_ethtool_nwayreset,
-	.get_link		= ethtool_op_get_link,
-	.get_regs_len		= smc911x_ethtool_getregslen,
-	.get_regs		= smc911x_ethtool_getregs,
-	.get_eeprom_len		= smc911x_ethtool_geteeprom_len,
-	.get_eeprom		= smc911x_ethtool_geteeprom,
-	.set_eeprom		= smc911x_ethtool_seteeprom,
-	.get_link_ksettings	= smc911x_ethtool_get_link_ksettings,
-	.set_link_ksettings	= smc911x_ethtool_set_link_ksettings,
-};
-
-/*
- * smc911x_findirq
- *
- * This routine has a simple purpose -- make the SMC chip generate an
- * interrupt, so an auto-detect routine can detect it, and find the IRQ,
- */
-static int smc911x_findirq(struct net_device *dev)
-{
-	struct smc911x_local *lp = netdev_priv(dev);
-	int timeout = 20;
-	unsigned long cookie;
-
-	DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__);
-
-	cookie = probe_irq_on();
-
-	/*
-	 * Force a SW interrupt
-	 */
-
-	SMC_SET_INT_EN(lp, INT_EN_SW_INT_EN_);
-
-	/*
-	 * Wait until positive that the interrupt has been generated
-	 */
-	do {
-		int int_status;
-		udelay(10);
-		int_status = SMC_GET_INT_EN(lp);
-		if (int_status & INT_EN_SW_INT_EN_)
-			break;		/* got the interrupt */
-	} while (--timeout);
-
-	/*
-	 * there is really nothing that I can do here if timeout fails,
-	 * as autoirq_report will return a 0 anyway, which is what I
-	 * want in this case.  Plus, the clean up is needed in both
-	 * cases.
- */ - - /* and disable all interrupts again */ - SMC_SET_INT_EN(lp, 0); - - /* and return what I found */ - return probe_irq_off(cookie); -} - -static const struct net_device_ops smc911x_netdev_ops = { - .ndo_open = smc911x_open, - .ndo_stop = smc911x_close, - .ndo_start_xmit = smc911x_hard_start_xmit, - .ndo_tx_timeout = smc911x_timeout, - .ndo_set_rx_mode = smc911x_set_multicast_list, - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = smc911x_poll_controller, -#endif -}; - -/* - * Function: smc911x_probe(unsigned long ioaddr) - * - * Purpose: - * Tests to see if a given ioaddr points to an SMC911x chip. - * Returns a 0 on success - * - * Algorithm: - * (1) see if the endian word is OK - * (1) see if I recognize the chip ID in the appropriate register - * - * Here I do typical initialization tasks. - * - * o Initialize the structure if needed - * o print out my vanity message if not done so already - * o print out what type of hardware is detected - * o print out the ethernet address - * o find the IRQ - * o set up my private data - * o configure the dev structure with my subroutines - * o actually GRAB the irq. - * o GRAB the region - */ -static int smc911x_probe(struct net_device *dev) -{ - struct smc911x_local *lp = netdev_priv(dev); - int i, retval; - unsigned int val, chip_id, revision; - const char *version_string; - unsigned long irq_flags; -#ifdef SMC_USE_DMA - struct dma_slave_config config; - dma_cap_mask_t mask; -#endif - u8 addr[ETH_ALEN]; - - DBG(SMC_DEBUG_FUNC, dev, "--> %s\n", __func__); - - /* First, see if the endian word is recognized */ - val = SMC_GET_BYTE_TEST(lp); - DBG(SMC_DEBUG_MISC, dev, "%s: endian probe returned 0x%04x\n", - CARDNAME, val); - if (val != 0x87654321) { - netdev_err(dev, "Invalid chip endian 0x%08x\n", val); - retval = -ENODEV; - goto err_out; - } - - /* - * check if the revision register is something that I - * recognize. These might need to be added to later, - * as future revisions could be added. - */ - chip_id = SMC_GET_PN(lp); - DBG(SMC_DEBUG_MISC, dev, "%s: id probe returned 0x%04x\n", - CARDNAME, chip_id); - for(i=0;chip_ids[i].id != 0; i++) { - if (chip_ids[i].id == chip_id) break; - } - if (!chip_ids[i].id) { - netdev_err(dev, "Unknown chip ID %04x\n", chip_id); - retval = -ENODEV; - goto err_out; - } - version_string = chip_ids[i].name; - - revision = SMC_GET_REV(lp); - DBG(SMC_DEBUG_MISC, dev, "%s: revision = 0x%04x\n", CARDNAME, revision); - - /* At this point I'll assume that the chip is an SMC911x. 
*/ - DBG(SMC_DEBUG_MISC, dev, "%s: Found a %s\n", - CARDNAME, chip_ids[i].name); - - /* Validate the TX FIFO size requested */ - if ((tx_fifo_kb < 2) || (tx_fifo_kb > 14)) { - netdev_err(dev, "Invalid TX FIFO size requested %d\n", - tx_fifo_kb); - retval = -EINVAL; - goto err_out; - } - - /* fill in some of the fields */ - lp->version = chip_ids[i].id; - lp->revision = revision; - lp->tx_fifo_kb = tx_fifo_kb; - /* Reverse calculate the RX FIFO size from the TX */ - lp->tx_fifo_size=(lp->tx_fifo_kb<<10) - 512; - lp->rx_fifo_size= ((0x4000 - 512 - lp->tx_fifo_size) / 16) * 15; - - /* Set the automatic flow control values */ - switch(lp->tx_fifo_kb) { - /* - * AFC_HI is about ((Rx Data Fifo Size)*2/3)/64 - * AFC_LO is AFC_HI/2 - * BACK_DUR is about 5uS*(AFC_LO) rounded down - */ - case 2:/* 13440 Rx Data Fifo Size */ - lp->afc_cfg=0x008C46AF;break; - case 3:/* 12480 Rx Data Fifo Size */ - lp->afc_cfg=0x0082419F;break; - case 4:/* 11520 Rx Data Fifo Size */ - lp->afc_cfg=0x00783C9F;break; - case 5:/* 10560 Rx Data Fifo Size */ - lp->afc_cfg=0x006E374F;break; - case 6:/* 9600 Rx Data Fifo Size */ - lp->afc_cfg=0x0064328F;break; - case 7:/* 8640 Rx Data Fifo Size */ - lp->afc_cfg=0x005A2D7F;break; - case 8:/* 7680 Rx Data Fifo Size */ - lp->afc_cfg=0x0050287F;break; - case 9:/* 6720 Rx Data Fifo Size */ - lp->afc_cfg=0x0046236F;break; - case 10:/* 5760 Rx Data Fifo Size */ - lp->afc_cfg=0x003C1E6F;break; - case 11:/* 4800 Rx Data Fifo Size */ - lp->afc_cfg=0x0032195F;break; - /* - * AFC_HI is ~1520 bytes less than RX Data Fifo Size - * AFC_LO is AFC_HI/2 - * BACK_DUR is about 5uS*(AFC_LO) rounded down - */ - case 12:/* 3840 Rx Data Fifo Size */ - lp->afc_cfg=0x0024124F;break; - case 13:/* 2880 Rx Data Fifo Size */ - lp->afc_cfg=0x0015073F;break; - case 14:/* 1920 Rx Data Fifo Size */ - lp->afc_cfg=0x0006032F;break; - default: - PRINTK(dev, "ERROR -- no AFC_CFG setting found"); - break; - } - - DBG(SMC_DEBUG_MISC | SMC_DEBUG_TX | SMC_DEBUG_RX, dev, - "%s: tx_fifo %d rx_fifo %d afc_cfg 0x%08x\n", CARDNAME, - lp->tx_fifo_size, lp->rx_fifo_size, lp->afc_cfg); - - spin_lock_init(&lp->lock); - - /* Get the MAC address */ - SMC_GET_MAC_ADDR(lp, addr); - eth_hw_addr_set(dev, addr); - - /* now, reset the chip, and put it into a known state */ - smc911x_reset(dev); - - /* - * If dev->irq is 0, then the device has to be banged on to see - * what the IRQ is. - * - * Specifying an IRQ is done with the assumption that the user knows - * what (s)he is doing. No checking is done!!!! - */ - if (dev->irq < 1) { - int trials; - - trials = 3; - while (trials--) { - dev->irq = smc911x_findirq(dev); - if (dev->irq) - break; - /* kick the card and try again */ - smc911x_reset(dev); - } - } - if (dev->irq == 0) { - netdev_warn(dev, "Couldn't autodetect your IRQ. Use irq=xx.\n"); - retval = -ENODEV; - goto err_out; - } - dev->irq = irq_canonicalize(dev->irq); - - dev->netdev_ops = &smc911x_netdev_ops; - dev->watchdog_timeo = msecs_to_jiffies(watchdog); - dev->ethtool_ops = &smc911x_ethtool_ops; - - INIT_WORK(&lp->phy_configure, smc911x_phy_configure); - lp->mii.phy_id_mask = 0x1f; - lp->mii.reg_num_mask = 0x1f; - lp->mii.force_media = 0; - lp->mii.full_duplex = 0; - lp->mii.dev = dev; - lp->mii.mdio_read = smc911x_phy_read; - lp->mii.mdio_write = smc911x_phy_write; - - /* - * Locate the phy, if any. 
- */ - smc911x_phy_detect(dev); - - /* Set default parameters */ - lp->msg_enable = NETIF_MSG_LINK; - lp->ctl_rfduplx = 1; - lp->ctl_rspeed = 100; - -#ifdef SMC_DYNAMIC_BUS_CONFIG - irq_flags = lp->cfg.irq_flags; -#else - irq_flags = IRQF_SHARED | SMC_IRQ_SENSE; -#endif - - /* Grab the IRQ */ - retval = request_irq(dev->irq, smc911x_interrupt, - irq_flags, dev->name, dev); - if (retval) - goto err_out; - -#ifdef SMC_USE_DMA - - dma_cap_zero(mask); - dma_cap_set(DMA_SLAVE, mask); - lp->rxdma = dma_request_channel(mask, NULL, NULL); - lp->txdma = dma_request_channel(mask, NULL, NULL); - lp->rxdma_active = 0; - lp->txdma_active = 0; - - memset(&config, 0, sizeof(config)); - config.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; - config.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; - config.src_addr = lp->physaddr + RX_DATA_FIFO; - config.dst_addr = lp->physaddr + TX_DATA_FIFO; - config.src_maxburst = 32; - config.dst_maxburst = 32; - retval = dmaengine_slave_config(lp->rxdma, &config); - if (retval) { - dev_err(lp->dev, "dma rx channel configuration failed: %d\n", - retval); - goto err_out; - } - retval = dmaengine_slave_config(lp->txdma, &config); - if (retval) { - dev_err(lp->dev, "dma tx channel configuration failed: %d\n", - retval); - goto err_out; - } -#endif - - retval = register_netdev(dev); - if (retval == 0) { - /* now, print out the card info, in a short format.. */ - netdev_info(dev, "%s (rev %d) at %#lx IRQ %d", - version_string, lp->revision, - dev->base_addr, dev->irq); - -#ifdef SMC_USE_DMA - if (lp->rxdma) - pr_cont(" RXDMA %p", lp->rxdma); - - if (lp->txdma) - pr_cont(" TXDMA %p", lp->txdma); -#endif - pr_cont("\n"); - if (!is_valid_ether_addr(dev->dev_addr)) { - netdev_warn(dev, "Invalid ethernet MAC address. Please set using ifconfig\n"); - } else { - /* Print the Ethernet address */ - netdev_info(dev, "Ethernet addr: %pM\n", - dev->dev_addr); - } - - if (lp->phy_type == 0) { - PRINTK(dev, "No PHY found\n"); - } else if ((lp->phy_type & ~0xff) == LAN911X_INTERNAL_PHY_ID) { - PRINTK(dev, "LAN911x Internal PHY\n"); - } else { - PRINTK(dev, "External PHY 0x%08x\n", lp->phy_type); - } - } - -err_out: -#ifdef SMC_USE_DMA - if (retval) { - if (lp->rxdma) - dma_release_channel(lp->rxdma); - if (lp->txdma) - dma_release_channel(lp->txdma); - } -#endif - return retval; -} - -/* - * smc911x_drv_probe(void) - * - * Output: - * 0 --> there is a device - * anything else, error - */ -static int smc911x_drv_probe(struct platform_device *pdev) -{ - struct net_device *ndev; - struct resource *res; - struct smc911x_local *lp; - void __iomem *addr; - int ret; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - ret = -ENODEV; - goto out; - } - - /* - * Request the regions. 
- */ - if (!request_mem_region(res->start, SMC911X_IO_EXTENT, CARDNAME)) { - ret = -EBUSY; - goto out; - } - - ndev = alloc_etherdev(sizeof(struct smc911x_local)); - if (!ndev) { - ret = -ENOMEM; - goto release_1; - } - SET_NETDEV_DEV(ndev, &pdev->dev); - - ndev->dma = (unsigned char)-1; - ndev->irq = platform_get_irq(pdev, 0); - if (ndev->irq < 0) { - ret = ndev->irq; - goto release_both; - } - - lp = netdev_priv(ndev); - lp->netdev = ndev; -#ifdef SMC_DYNAMIC_BUS_CONFIG - { - struct smc911x_platdata *pd = dev_get_platdata(&pdev->dev); - if (!pd) { - ret = -EINVAL; - goto release_both; - } - memcpy(&lp->cfg, pd, sizeof(lp->cfg)); - } -#endif - - addr = ioremap(res->start, SMC911X_IO_EXTENT); - if (!addr) { - ret = -ENOMEM; - goto release_both; - } - - platform_set_drvdata(pdev, ndev); - lp->base = addr; - ndev->base_addr = res->start; - ret = smc911x_probe(ndev); - if (ret != 0) { - iounmap(addr); -release_both: - free_netdev(ndev); -release_1: - release_mem_region(res->start, SMC911X_IO_EXTENT); -out: - pr_info("%s: not found (%d).\n", CARDNAME, ret); - } -#ifdef SMC_USE_DMA - else { - lp->physaddr = res->start; - lp->dev = &pdev->dev; - } -#endif - - return ret; -} - -static int smc911x_drv_remove(struct platform_device *pdev) -{ - struct net_device *ndev = platform_get_drvdata(pdev); - struct smc911x_local *lp = netdev_priv(ndev); - struct resource *res; - - DBG(SMC_DEBUG_FUNC, ndev, "--> %s\n", __func__); - - unregister_netdev(ndev); - - free_irq(ndev->irq, ndev); - -#ifdef SMC_USE_DMA - { - if (lp->rxdma) - dma_release_channel(lp->rxdma); - if (lp->txdma) - dma_release_channel(lp->txdma); - } -#endif - iounmap(lp->base); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(res->start, SMC911X_IO_EXTENT); - - free_netdev(ndev); - return 0; -} - -static int smc911x_drv_suspend(struct platform_device *dev, pm_message_t state) -{ - struct net_device *ndev = platform_get_drvdata(dev); - struct smc911x_local *lp = netdev_priv(ndev); - - DBG(SMC_DEBUG_FUNC, ndev, "--> %s\n", __func__); - if (ndev) { - if (netif_running(ndev)) { - netif_device_detach(ndev); - smc911x_shutdown(ndev); -#if POWER_DOWN - /* Set D2 - Energy detect only setting */ - SMC_SET_PMT_CTRL(lp, 2<<12); -#endif - } - } - return 0; -} - -static int smc911x_drv_resume(struct platform_device *dev) -{ - struct net_device *ndev = platform_get_drvdata(dev); - - DBG(SMC_DEBUG_FUNC, ndev, "--> %s\n", __func__); - if (ndev) { - struct smc911x_local *lp = netdev_priv(ndev); - - if (netif_running(ndev)) { - smc911x_reset(ndev); - if (lp->phy_type != 0) - smc911x_phy_configure(&lp->phy_configure); - smc911x_enable(ndev); - netif_device_attach(ndev); - } - } - return 0; -} - -static struct platform_driver smc911x_driver = { - .probe = smc911x_drv_probe, - .remove = smc911x_drv_remove, - .suspend = smc911x_drv_suspend, - .resume = smc911x_drv_resume, - .driver = { - .name = CARDNAME, - }, -}; - -module_platform_driver(smc911x_driver); diff --git a/drivers/net/ethernet/smsc/smc911x.h b/drivers/net/ethernet/smsc/smc911x.h deleted file mode 100644 index d4edcc0da87c..000000000000 --- a/drivers/net/ethernet/smsc/smc911x.h +++ /dev/null @@ -1,901 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/*------------------------------------------------------------------------ - . smc911x.h - macros for SMSC's LAN911{5,6,7,8} single-chip Ethernet device. - . - . Copyright (C) 2005 Sensoria Corp. - . Derived from the unified SMC91x driver by Nicolas Pitre - . - . - . 
Information contained in this file was obtained from the LAN9118
- . manual from SMC.  To get a copy, if you really want one, you can find
- . information under www.smsc.com.
- .
- . Authors
- .	 Dustin McIntire
- .
- ---------------------------------------------------------------------------*/
-#ifndef _SMC911X_H_
-#define _SMC911X_H_
-
-#include <linux/smc911x.h>
-/*
- * Use the DMA feature on PXA chips
- */
-#ifdef CONFIG_ARCH_PXA
-  #define SMC_USE_PXA_DMA	1
-  #define SMC_USE_16BIT		0
-  #define SMC_USE_32BIT		1
-  #define SMC_IRQ_SENSE		IRQF_TRIGGER_FALLING
-#elif defined(CONFIG_SH_MAGIC_PANEL_R2)
-  #define SMC_USE_16BIT		0
-  #define SMC_USE_32BIT		1
-  #define SMC_IRQ_SENSE		IRQF_TRIGGER_LOW
-#elif defined(CONFIG_ARCH_OMAP3)
-  #define SMC_USE_16BIT		0
-  #define SMC_USE_32BIT		1
-  #define SMC_IRQ_SENSE		IRQF_TRIGGER_LOW
-  #define SMC_MEM_RESERVED	1
-#elif defined(CONFIG_ARCH_OMAP2)
-  #define SMC_USE_16BIT		0
-  #define SMC_USE_32BIT		1
-  #define SMC_IRQ_SENSE		IRQF_TRIGGER_LOW
-  #define SMC_MEM_RESERVED	1
-#else
-/*
- * Default configuration
- */
-
-#define SMC_DYNAMIC_BUS_CONFIG
-#endif
-
-#ifdef SMC_USE_PXA_DMA
-#define SMC_USE_DMA
-#endif
-
-/* store this information for the driver.. */
-struct smc911x_local {
-	/*
-	 * If I have to wait until the DMA is finished and ready to reload a
-	 * packet, I will store the skbuff here. Then, the DMA will send it
-	 * out and free it.
-	 */
-	struct sk_buff *pending_tx_skb;
-
-	/* version/revision of the SMC911x chip */
-	u16 version;
-	u16 revision;
-
-	/* FIFO sizes */
-	int tx_fifo_kb;
-	int tx_fifo_size;
-	int rx_fifo_size;
-	int afc_cfg;
-
-	/* Contains the current active receive/phy mode */
-	int ctl_rfduplx;
-	int ctl_rspeed;
-
-	u32 msg_enable;
-	u32 phy_type;
-	struct mii_if_info mii;
-
-	/* work queue */
-	struct work_struct phy_configure;
-
-	int tx_throttle;
-	spinlock_t lock;
-
-	struct net_device *netdev;
-
-#ifdef SMC_USE_DMA
-	/* DMA needs the physical address of the chip */
-	u_long physaddr;
-	struct dma_chan *rxdma;
-	struct dma_chan *txdma;
-	int rxdma_active;
-	int txdma_active;
-	struct sk_buff *current_rx_skb;
-	struct sk_buff *current_tx_skb;
-	struct device *dev;
-#endif
-	void __iomem *base;
-#ifdef SMC_DYNAMIC_BUS_CONFIG
-	struct smc911x_platdata cfg;
-#endif
-};
-
-/*
- * Define the bus width specific IO macros
- */
-
-#ifdef SMC_DYNAMIC_BUS_CONFIG
-static inline unsigned int SMC_inl(struct smc911x_local *lp, int reg)
-{
-	void __iomem *ioaddr = lp->base + reg;
-
-	if (lp->cfg.flags & SMC911X_USE_32BIT)
-		return readl(ioaddr);
-
-	if (lp->cfg.flags & SMC911X_USE_16BIT)
-		return readw(ioaddr) | (readw(ioaddr + 2) << 16);
-
-	BUG();
-}
-
-static inline void SMC_outl(unsigned int value, struct smc911x_local *lp,
-			    int reg)
-{
-	void __iomem *ioaddr = lp->base + reg;
-
-	if (lp->cfg.flags & SMC911X_USE_32BIT) {
-		writel(value, ioaddr);
-		return;
-	}
-
-	if (lp->cfg.flags & SMC911X_USE_16BIT) {
-		writew(value & 0xffff, ioaddr);
-		writew(value >> 16, ioaddr + 2);
-		return;
-	}
-
-	BUG();
-}
-
-static inline void SMC_insl(struct smc911x_local *lp, int reg,
-			    void *addr, unsigned int count)
-{
-	void __iomem *ioaddr = lp->base + reg;
-
-	if (lp->cfg.flags & SMC911X_USE_32BIT) {
-		ioread32_rep(ioaddr, addr, count);
-		return;
-	}
-
-	if (lp->cfg.flags & SMC911X_USE_16BIT) {
-		ioread16_rep(ioaddr, addr, count * 2);
-		return;
-	}
-
-	BUG();
-}
-
-static inline void SMC_outsl(struct smc911x_local *lp, int reg,
-			     void *addr, unsigned int count)
-{
-	void __iomem *ioaddr = lp->base + reg;
-
-	if (lp->cfg.flags & SMC911X_USE_32BIT) {
-		iowrite32_rep(ioaddr, addr,
count);
-		return;
-	}
-
-	if (lp->cfg.flags & SMC911X_USE_16BIT) {
-		iowrite16_rep(ioaddr, addr, count * 2);
-		return;
-	}
-
-	BUG();
-}
-#else
-#if SMC_USE_16BIT
-#define SMC_inl(lp, r)		((readw((lp)->base + (r)) & 0xFFFF) + (readw((lp)->base + (r) + 2) << 16))
-#define SMC_outl(v, lp, r) \
-	do{ \
-		writew(v & 0xFFFF, (lp)->base + (r)); \
-		writew(v >> 16, (lp)->base + (r) + 2); \
-	} while (0)
-#define SMC_insl(lp, r, p, l)	ioread16_rep((short*)((lp)->base + (r)), p, l*2)
-#define SMC_outsl(lp, r, p, l)	iowrite16_rep((short*)((lp)->base + (r)), p, l*2)
-
-#elif SMC_USE_32BIT
-#define SMC_inl(lp, r)		readl((lp)->base + (r))
-#define SMC_outl(v, lp, r)	writel(v, (lp)->base + (r))
-#define SMC_insl(lp, r, p, l)	ioread32_rep((int*)((lp)->base + (r)), p, l)
-#define SMC_outsl(lp, r, p, l)	iowrite32_rep((int*)((lp)->base + (r)), p, l)
-
-#endif /* SMC_USE_16BIT */
-#endif /* SMC_DYNAMIC_BUS_CONFIG */
-
-
-#ifdef SMC_USE_PXA_DMA
-
-/*
- * Use a DMA for RX and TX packets.
- */
-#include <linux/dma-mapping.h>
-
-static dma_addr_t rx_dmabuf, tx_dmabuf;
-static int rx_dmalen, tx_dmalen;
-static void smc911x_rx_dma_irq(void *data);
-static void smc911x_tx_dma_irq(void *data);
-
-#ifdef SMC_insl
-#undef SMC_insl
-#define SMC_insl(lp, r, p, l) \
-	smc_pxa_dma_insl(lp, lp->physaddr, r, lp->rxdma, p, l)
-
-static inline void
-smc_pxa_dma_insl(struct smc911x_local *lp, u_long physaddr,
-		int reg, struct dma_chan *dma, u_char *buf, int len)
-{
-	struct dma_async_tx_descriptor *tx;
-
-	/* 64 bit alignment is required for memory to memory DMA */
-	if ((long)buf & 4) {
-		*((u32 *)buf) = SMC_inl(lp, reg);
-		buf += 4;
-		len--;
-	}
-
-	len *= 4;
-	rx_dmabuf = dma_map_single(lp->dev, buf, len, DMA_FROM_DEVICE);
-	rx_dmalen = len;
-	tx = dmaengine_prep_slave_single(dma, rx_dmabuf, rx_dmalen,
-					 DMA_DEV_TO_MEM, 0);
-	if (tx) {
-		tx->callback = smc911x_rx_dma_irq;
-		tx->callback_param = lp;
-		dmaengine_submit(tx);
-		dma_async_issue_pending(dma);
-	}
-}
-#endif
-
-#ifdef SMC_outsl
-#undef SMC_outsl
-#define SMC_outsl(lp, r, p, l) \
-	smc_pxa_dma_outsl(lp, lp->physaddr, r, lp->txdma, p, l)
-
-static inline void
-smc_pxa_dma_outsl(struct smc911x_local *lp, u_long physaddr,
-		int reg, struct dma_chan *dma, u_char *buf, int len)
-{
-	struct dma_async_tx_descriptor *tx;
-
-	/* 64 bit alignment is required for memory to memory DMA */
-	if ((long)buf & 4) {
-		SMC_outl(*((u32 *)buf), lp, reg);
-		buf += 4;
-		len--;
-	}
-
-	len *= 4;
-	tx_dmabuf = dma_map_single(lp->dev, buf, len, DMA_TO_DEVICE);
-	tx_dmalen = len;
-	tx = dmaengine_prep_slave_single(dma, tx_dmabuf, tx_dmalen,
-					 DMA_DEV_TO_MEM, 0);
-	if (tx) {
-		tx->callback = smc911x_tx_dma_irq;
-		tx->callback_param = lp;
-		dmaengine_submit(tx);
-		dma_async_issue_pending(dma);
-	}
-}
-#endif
-#endif /* SMC_USE_PXA_DMA */
-
-
-/* Chip Parameters and Register Definitions */
-
-#define SMC911X_TX_FIFO_LOW_THRESHOLD	(1536*2)
-
-#define SMC911X_IO_EXTENT	0x100
-
-#define SMC911X_EEPROM_LEN	7
-
-/* Below are the register offsets and bit definitions
- * of the Lan911x memory space
- */
-#define RX_DATA_FIFO		(0x00)
-
-#define TX_DATA_FIFO		(0x20)
-#define	TX_CMD_A_INT_ON_COMP_		(0x80000000)
-#define	TX_CMD_A_INT_BUF_END_ALGN_	(0x03000000)
-#define	TX_CMD_A_INT_4_BYTE_ALGN_	(0x00000000)
-#define	TX_CMD_A_INT_16_BYTE_ALGN_	(0x01000000)
-#define	TX_CMD_A_INT_32_BYTE_ALGN_	(0x02000000)
-#define	TX_CMD_A_INT_DATA_OFFSET_	(0x001F0000)
-#define	TX_CMD_A_INT_FIRST_SEG_		(0x00002000)
-#define	TX_CMD_A_INT_LAST_SEG_		(0x00001000)
-#define	TX_CMD_A_BUF_SIZE_		(0x000007FF)
-#define	TX_CMD_B_PKT_TAG_		(0xFFFF0000)
-#define
TX_CMD_B_ADD_CRC_DISABLE_ (0x00002000) -#define TX_CMD_B_DISABLE_PADDING_ (0x00001000) -#define TX_CMD_B_PKT_BYTE_LENGTH_ (0x000007FF) - -#define RX_STATUS_FIFO (0x40) -#define RX_STS_PKT_LEN_ (0x3FFF0000) -#define RX_STS_ES_ (0x00008000) -#define RX_STS_BCST_ (0x00002000) -#define RX_STS_LEN_ERR_ (0x00001000) -#define RX_STS_RUNT_ERR_ (0x00000800) -#define RX_STS_MCAST_ (0x00000400) -#define RX_STS_TOO_LONG_ (0x00000080) -#define RX_STS_COLL_ (0x00000040) -#define RX_STS_ETH_TYPE_ (0x00000020) -#define RX_STS_WDOG_TMT_ (0x00000010) -#define RX_STS_MII_ERR_ (0x00000008) -#define RX_STS_DRIBBLING_ (0x00000004) -#define RX_STS_CRC_ERR_ (0x00000002) -#define RX_STATUS_FIFO_PEEK (0x44) -#define TX_STATUS_FIFO (0x48) -#define TX_STS_TAG_ (0xFFFF0000) -#define TX_STS_ES_ (0x00008000) -#define TX_STS_LOC_ (0x00000800) -#define TX_STS_NO_CARR_ (0x00000400) -#define TX_STS_LATE_COLL_ (0x00000200) -#define TX_STS_MANY_COLL_ (0x00000100) -#define TX_STS_COLL_CNT_ (0x00000078) -#define TX_STS_MANY_DEFER_ (0x00000004) -#define TX_STS_UNDERRUN_ (0x00000002) -#define TX_STS_DEFERRED_ (0x00000001) -#define TX_STATUS_FIFO_PEEK (0x4C) -#define ID_REV (0x50) -#define ID_REV_CHIP_ID_ (0xFFFF0000) /* RO */ -#define ID_REV_REV_ID_ (0x0000FFFF) /* RO */ - -#define INT_CFG (0x54) -#define INT_CFG_INT_DEAS_ (0xFF000000) /* R/W */ -#define INT_CFG_INT_DEAS_CLR_ (0x00004000) -#define INT_CFG_INT_DEAS_STS_ (0x00002000) -#define INT_CFG_IRQ_INT_ (0x00001000) /* RO */ -#define INT_CFG_IRQ_EN_ (0x00000100) /* R/W */ -#define INT_CFG_IRQ_POL_ (0x00000010) /* R/W Not Affected by SW Reset */ -#define INT_CFG_IRQ_TYPE_ (0x00000001) /* R/W Not Affected by SW Reset */ - -#define INT_STS (0x58) -#define INT_STS_SW_INT_ (0x80000000) /* R/WC */ -#define INT_STS_TXSTOP_INT_ (0x02000000) /* R/WC */ -#define INT_STS_RXSTOP_INT_ (0x01000000) /* R/WC */ -#define INT_STS_RXDFH_INT_ (0x00800000) /* R/WC */ -#define INT_STS_RXDF_INT_ (0x00400000) /* R/WC */ -#define INT_STS_TX_IOC_ (0x00200000) /* R/WC */ -#define INT_STS_RXD_INT_ (0x00100000) /* R/WC */ -#define INT_STS_GPT_INT_ (0x00080000) /* R/WC */ -#define INT_STS_PHY_INT_ (0x00040000) /* RO */ -#define INT_STS_PME_INT_ (0x00020000) /* R/WC */ -#define INT_STS_TXSO_ (0x00010000) /* R/WC */ -#define INT_STS_RWT_ (0x00008000) /* R/WC */ -#define INT_STS_RXE_ (0x00004000) /* R/WC */ -#define INT_STS_TXE_ (0x00002000) /* R/WC */ -//#define INT_STS_ERX_ (0x00001000) /* R/WC */ -#define INT_STS_TDFU_ (0x00000800) /* R/WC */ -#define INT_STS_TDFO_ (0x00000400) /* R/WC */ -#define INT_STS_TDFA_ (0x00000200) /* R/WC */ -#define INT_STS_TSFF_ (0x00000100) /* R/WC */ -#define INT_STS_TSFL_ (0x00000080) /* R/WC */ -//#define INT_STS_RXDF_ (0x00000040) /* R/WC */ -#define INT_STS_RDFO_ (0x00000040) /* R/WC */ -#define INT_STS_RDFL_ (0x00000020) /* R/WC */ -#define INT_STS_RSFF_ (0x00000010) /* R/WC */ -#define INT_STS_RSFL_ (0x00000008) /* R/WC */ -#define INT_STS_GPIO2_INT_ (0x00000004) /* R/WC */ -#define INT_STS_GPIO1_INT_ (0x00000002) /* R/WC */ -#define INT_STS_GPIO0_INT_ (0x00000001) /* R/WC */ - -#define INT_EN (0x5C) -#define INT_EN_SW_INT_EN_ (0x80000000) /* R/W */ -#define INT_EN_TXSTOP_INT_EN_ (0x02000000) /* R/W */ -#define INT_EN_RXSTOP_INT_EN_ (0x01000000) /* R/W */ -#define INT_EN_RXDFH_INT_EN_ (0x00800000) /* R/W */ -//#define INT_EN_RXDF_INT_EN_ (0x00400000) /* R/W */ -#define INT_EN_TIOC_INT_EN_ (0x00200000) /* R/W */ -#define INT_EN_RXD_INT_EN_ (0x00100000) /* R/W */ -#define INT_EN_GPT_INT_EN_ (0x00080000) /* R/W */ -#define INT_EN_PHY_INT_EN_ (0x00040000) /* R/W */ 
-#define INT_EN_PME_INT_EN_ (0x00020000) /* R/W */ -#define INT_EN_TXSO_EN_ (0x00010000) /* R/W */ -#define INT_EN_RWT_EN_ (0x00008000) /* R/W */ -#define INT_EN_RXE_EN_ (0x00004000) /* R/W */ -#define INT_EN_TXE_EN_ (0x00002000) /* R/W */ -//#define INT_EN_ERX_EN_ (0x00001000) /* R/W */ -#define INT_EN_TDFU_EN_ (0x00000800) /* R/W */ -#define INT_EN_TDFO_EN_ (0x00000400) /* R/W */ -#define INT_EN_TDFA_EN_ (0x00000200) /* R/W */ -#define INT_EN_TSFF_EN_ (0x00000100) /* R/W */ -#define INT_EN_TSFL_EN_ (0x00000080) /* R/W */ -//#define INT_EN_RXDF_EN_ (0x00000040) /* R/W */ -#define INT_EN_RDFO_EN_ (0x00000040) /* R/W */ -#define INT_EN_RDFL_EN_ (0x00000020) /* R/W */ -#define INT_EN_RSFF_EN_ (0x00000010) /* R/W */ -#define INT_EN_RSFL_EN_ (0x00000008) /* R/W */ -#define INT_EN_GPIO2_INT_ (0x00000004) /* R/W */ -#define INT_EN_GPIO1_INT_ (0x00000002) /* R/W */ -#define INT_EN_GPIO0_INT_ (0x00000001) /* R/W */ - -#define BYTE_TEST (0x64) -#define FIFO_INT (0x68) -#define FIFO_INT_TX_AVAIL_LEVEL_ (0xFF000000) /* R/W */ -#define FIFO_INT_TX_STS_LEVEL_ (0x00FF0000) /* R/W */ -#define FIFO_INT_RX_AVAIL_LEVEL_ (0x0000FF00) /* R/W */ -#define FIFO_INT_RX_STS_LEVEL_ (0x000000FF) /* R/W */ - -#define RX_CFG (0x6C) -#define RX_CFG_RX_END_ALGN_ (0xC0000000) /* R/W */ -#define RX_CFG_RX_END_ALGN4_ (0x00000000) /* R/W */ -#define RX_CFG_RX_END_ALGN16_ (0x40000000) /* R/W */ -#define RX_CFG_RX_END_ALGN32_ (0x80000000) /* R/W */ -#define RX_CFG_RX_DMA_CNT_ (0x0FFF0000) /* R/W */ -#define RX_CFG_RX_DUMP_ (0x00008000) /* R/W */ -#define RX_CFG_RXDOFF_ (0x00001F00) /* R/W */ -//#define RX_CFG_RXBAD_ (0x00000001) /* R/W */ - -#define TX_CFG (0x70) -//#define TX_CFG_TX_DMA_LVL_ (0xE0000000) /* R/W */ -//#define TX_CFG_TX_DMA_CNT_ (0x0FFF0000) /* R/W Self Clearing */ -#define TX_CFG_TXS_DUMP_ (0x00008000) /* Self Clearing */ -#define TX_CFG_TXD_DUMP_ (0x00004000) /* Self Clearing */ -#define TX_CFG_TXSAO_ (0x00000004) /* R/W */ -#define TX_CFG_TX_ON_ (0x00000002) /* R/W */ -#define TX_CFG_STOP_TX_ (0x00000001) /* Self Clearing */ - -#define HW_CFG (0x74) -#define HW_CFG_TTM_ (0x00200000) /* R/W */ -#define HW_CFG_SF_ (0x00100000) /* R/W */ -#define HW_CFG_TX_FIF_SZ_ (0x000F0000) /* R/W */ -#define HW_CFG_TR_ (0x00003000) /* R/W */ -#define HW_CFG_PHY_CLK_SEL_ (0x00000060) /* R/W */ -#define HW_CFG_PHY_CLK_SEL_INT_PHY_ (0x00000000) /* R/W */ -#define HW_CFG_PHY_CLK_SEL_EXT_PHY_ (0x00000020) /* R/W */ -#define HW_CFG_PHY_CLK_SEL_CLK_DIS_ (0x00000040) /* R/W */ -#define HW_CFG_SMI_SEL_ (0x00000010) /* R/W */ -#define HW_CFG_EXT_PHY_DET_ (0x00000008) /* RO */ -#define HW_CFG_EXT_PHY_EN_ (0x00000004) /* R/W */ -#define HW_CFG_32_16_BIT_MODE_ (0x00000004) /* RO */ -#define HW_CFG_SRST_TO_ (0x00000002) /* RO */ -#define HW_CFG_SRST_ (0x00000001) /* Self Clearing */ - -#define RX_DP_CTRL (0x78) -#define RX_DP_CTRL_RX_FFWD_ (0x80000000) /* R/W */ -#define RX_DP_CTRL_FFWD_BUSY_ (0x80000000) /* RO */ - -#define RX_FIFO_INF (0x7C) -#define RX_FIFO_INF_RXSUSED_ (0x00FF0000) /* RO */ -#define RX_FIFO_INF_RXDUSED_ (0x0000FFFF) /* RO */ - -#define TX_FIFO_INF (0x80) -#define TX_FIFO_INF_TSUSED_ (0x00FF0000) /* RO */ -#define TX_FIFO_INF_TDFREE_ (0x0000FFFF) /* RO */ - -#define PMT_CTRL (0x84) -#define PMT_CTRL_PM_MODE_ (0x00003000) /* Self Clearing */ -#define PMT_CTRL_PHY_RST_ (0x00000400) /* Self Clearing */ -#define PMT_CTRL_WOL_EN_ (0x00000200) /* R/W */ -#define PMT_CTRL_ED_EN_ (0x00000100) /* R/W */ -#define PMT_CTRL_PME_TYPE_ (0x00000040) /* R/W Not Affected by SW Reset */ -#define PMT_CTRL_WUPS_ (0x00000030) /* R/WC */ 
-#define PMT_CTRL_WUPS_NOWAKE_ (0x00000000) /* R/WC */ -#define PMT_CTRL_WUPS_ED_ (0x00000010) /* R/WC */ -#define PMT_CTRL_WUPS_WOL_ (0x00000020) /* R/WC */ -#define PMT_CTRL_WUPS_MULTI_ (0x00000030) /* R/WC */ -#define PMT_CTRL_PME_IND_ (0x00000008) /* R/W */ -#define PMT_CTRL_PME_POL_ (0x00000004) /* R/W */ -#define PMT_CTRL_PME_EN_ (0x00000002) /* R/W Not Affected by SW Reset */ -#define PMT_CTRL_READY_ (0x00000001) /* RO */ - -#define GPIO_CFG (0x88) -#define GPIO_CFG_LED3_EN_ (0x40000000) /* R/W */ -#define GPIO_CFG_LED2_EN_ (0x20000000) /* R/W */ -#define GPIO_CFG_LED1_EN_ (0x10000000) /* R/W */ -#define GPIO_CFG_GPIO2_INT_POL_ (0x04000000) /* R/W */ -#define GPIO_CFG_GPIO1_INT_POL_ (0x02000000) /* R/W */ -#define GPIO_CFG_GPIO0_INT_POL_ (0x01000000) /* R/W */ -#define GPIO_CFG_EEPR_EN_ (0x00700000) /* R/W */ -#define GPIO_CFG_GPIOBUF2_ (0x00040000) /* R/W */ -#define GPIO_CFG_GPIOBUF1_ (0x00020000) /* R/W */ -#define GPIO_CFG_GPIOBUF0_ (0x00010000) /* R/W */ -#define GPIO_CFG_GPIODIR2_ (0x00000400) /* R/W */ -#define GPIO_CFG_GPIODIR1_ (0x00000200) /* R/W */ -#define GPIO_CFG_GPIODIR0_ (0x00000100) /* R/W */ -#define GPIO_CFG_GPIOD4_ (0x00000010) /* R/W */ -#define GPIO_CFG_GPIOD3_ (0x00000008) /* R/W */ -#define GPIO_CFG_GPIOD2_ (0x00000004) /* R/W */ -#define GPIO_CFG_GPIOD1_ (0x00000002) /* R/W */ -#define GPIO_CFG_GPIOD0_ (0x00000001) /* R/W */ - -#define GPT_CFG (0x8C) -#define GPT_CFG_TIMER_EN_ (0x20000000) /* R/W */ -#define GPT_CFG_GPT_LOAD_ (0x0000FFFF) /* R/W */ - -#define GPT_CNT (0x90) -#define GPT_CNT_GPT_CNT_ (0x0000FFFF) /* RO */ - -#define ENDIAN (0x98) -#define FREE_RUN (0x9C) -#define RX_DROP (0xA0) -#define MAC_CSR_CMD (0xA4) -#define MAC_CSR_CMD_CSR_BUSY_ (0x80000000) /* Self Clearing */ -#define MAC_CSR_CMD_R_NOT_W_ (0x40000000) /* R/W */ -#define MAC_CSR_CMD_CSR_ADDR_ (0x000000FF) /* R/W */ - -#define MAC_CSR_DATA (0xA8) -#define AFC_CFG (0xAC) -#define AFC_CFG_AFC_HI_ (0x00FF0000) /* R/W */ -#define AFC_CFG_AFC_LO_ (0x0000FF00) /* R/W */ -#define AFC_CFG_BACK_DUR_ (0x000000F0) /* R/W */ -#define AFC_CFG_FCMULT_ (0x00000008) /* R/W */ -#define AFC_CFG_FCBRD_ (0x00000004) /* R/W */ -#define AFC_CFG_FCADD_ (0x00000002) /* R/W */ -#define AFC_CFG_FCANY_ (0x00000001) /* R/W */ - -#define E2P_CMD (0xB0) -#define E2P_CMD_EPC_BUSY_ (0x80000000) /* Self Clearing */ -#define E2P_CMD_EPC_CMD_ (0x70000000) /* R/W */ -#define E2P_CMD_EPC_CMD_READ_ (0x00000000) /* R/W */ -#define E2P_CMD_EPC_CMD_EWDS_ (0x10000000) /* R/W */ -#define E2P_CMD_EPC_CMD_EWEN_ (0x20000000) /* R/W */ -#define E2P_CMD_EPC_CMD_WRITE_ (0x30000000) /* R/W */ -#define E2P_CMD_EPC_CMD_WRAL_ (0x40000000) /* R/W */ -#define E2P_CMD_EPC_CMD_ERASE_ (0x50000000) /* R/W */ -#define E2P_CMD_EPC_CMD_ERAL_ (0x60000000) /* R/W */ -#define E2P_CMD_EPC_CMD_RELOAD_ (0x70000000) /* R/W */ -#define E2P_CMD_EPC_TIMEOUT_ (0x00000200) /* RO */ -#define E2P_CMD_MAC_ADDR_LOADED_ (0x00000100) /* RO */ -#define E2P_CMD_EPC_ADDR_ (0x000000FF) /* R/W */ - -#define E2P_DATA (0xB4) -#define E2P_DATA_EEPROM_DATA_ (0x000000FF) /* R/W */ -/* end of LAN register offsets and bit definitions */ - -/* - **************************************************************************** - **************************************************************************** - * MAC Control and Status Register (Indirect Address) - * Offset (through the MAC_CSR CMD and DATA port) - **************************************************************************** - **************************************************************************** - * - */ -#define 
MAC_CR (0x01) /* R/W */ - -/* MAC_CR - MAC Control Register */ -#define MAC_CR_RXALL_ (0x80000000) -// TODO: delete this bit? It is not described in the data sheet. -#define MAC_CR_HBDIS_ (0x10000000) -#define MAC_CR_RCVOWN_ (0x00800000) -#define MAC_CR_LOOPBK_ (0x00200000) -#define MAC_CR_FDPX_ (0x00100000) -#define MAC_CR_MCPAS_ (0x00080000) -#define MAC_CR_PRMS_ (0x00040000) -#define MAC_CR_INVFILT_ (0x00020000) -#define MAC_CR_PASSBAD_ (0x00010000) -#define MAC_CR_HFILT_ (0x00008000) -#define MAC_CR_HPFILT_ (0x00002000) -#define MAC_CR_LCOLL_ (0x00001000) -#define MAC_CR_BCAST_ (0x00000800) -#define MAC_CR_DISRTY_ (0x00000400) -#define MAC_CR_PADSTR_ (0x00000100) -#define MAC_CR_BOLMT_MASK_ (0x000000C0) -#define MAC_CR_DFCHK_ (0x00000020) -#define MAC_CR_TXEN_ (0x00000008) -#define MAC_CR_RXEN_ (0x00000004) - -#define ADDRH (0x02) /* R/W mask 0x0000FFFFUL */ -#define ADDRL (0x03) /* R/W mask 0xFFFFFFFFUL */ -#define HASHH (0x04) /* R/W */ -#define HASHL (0x05) /* R/W */ - -#define MII_ACC (0x06) /* R/W */ -#define MII_ACC_PHY_ADDR_ (0x0000F800) -#define MII_ACC_MIIRINDA_ (0x000007C0) -#define MII_ACC_MII_WRITE_ (0x00000002) -#define MII_ACC_MII_BUSY_ (0x00000001) - -#define MII_DATA (0x07) /* R/W mask 0x0000FFFFUL */ - -#define FLOW (0x08) /* R/W */ -#define FLOW_FCPT_ (0xFFFF0000) -#define FLOW_FCPASS_ (0x00000004) -#define FLOW_FCEN_ (0x00000002) -#define FLOW_FCBSY_ (0x00000001) - -#define VLAN1 (0x09) /* R/W mask 0x0000FFFFUL */ -#define VLAN1_VTI1_ (0x0000ffff) - -#define VLAN2 (0x0A) /* R/W mask 0x0000FFFFUL */ -#define VLAN2_VTI2_ (0x0000ffff) - -#define WUFF (0x0B) /* WO */ - -#define WUCSR (0x0C) /* R/W */ -#define WUCSR_GUE_ (0x00000200) -#define WUCSR_WUFR_ (0x00000040) -#define WUCSR_MPR_ (0x00000020) -#define WUCSR_WAKE_EN_ (0x00000004) -#define WUCSR_MPEN_ (0x00000002) - -/* - **************************************************************************** - * Chip Specific MII Defines - **************************************************************************** - * - * Phy register offsets and bit definitions - * - */ - -#define PHY_MODE_CTRL_STS ((u32)17) /* Mode Control/Status Register */ -//#define MODE_CTRL_STS_FASTRIP_ ((u16)0x4000) -#define MODE_CTRL_STS_EDPWRDOWN_ ((u16)0x2000) -//#define MODE_CTRL_STS_LOWSQEN_ ((u16)0x0800) -//#define MODE_CTRL_STS_MDPREBP_ ((u16)0x0400) -//#define MODE_CTRL_STS_FARLOOPBACK_ ((u16)0x0200) -//#define MODE_CTRL_STS_FASTEST_ ((u16)0x0100) -//#define MODE_CTRL_STS_REFCLKEN_ ((u16)0x0010) -//#define MODE_CTRL_STS_PHYADBP_ ((u16)0x0008) -//#define MODE_CTRL_STS_FORCE_G_LINK_ ((u16)0x0004) -#define MODE_CTRL_STS_ENERGYON_ ((u16)0x0002) - -#define PHY_INT_SRC ((u32)29) -#define PHY_INT_SRC_ENERGY_ON_ ((u16)0x0080) -#define PHY_INT_SRC_ANEG_COMP_ ((u16)0x0040) -#define PHY_INT_SRC_REMOTE_FAULT_ ((u16)0x0020) -#define PHY_INT_SRC_LINK_DOWN_ ((u16)0x0010) -#define PHY_INT_SRC_ANEG_LP_ACK_ ((u16)0x0008) -#define PHY_INT_SRC_PAR_DET_FAULT_ ((u16)0x0004) -#define PHY_INT_SRC_ANEG_PGRX_ ((u16)0x0002) - -#define PHY_INT_MASK ((u32)30) -#define PHY_INT_MASK_ENERGY_ON_ ((u16)0x0080) -#define PHY_INT_MASK_ANEG_COMP_ ((u16)0x0040) -#define PHY_INT_MASK_REMOTE_FAULT_ ((u16)0x0020) -#define PHY_INT_MASK_LINK_DOWN_ ((u16)0x0010) -#define PHY_INT_MASK_ANEG_LP_ACK_ ((u16)0x0008) -#define PHY_INT_MASK_PAR_DET_FAULT_ ((u16)0x0004) -#define PHY_INT_MASK_ANEG_PGRX_ ((u16)0x0002) - -#define PHY_SPECIAL ((u32)31) -#define PHY_SPECIAL_ANEG_DONE_ ((u16)0x1000) -#define PHY_SPECIAL_RES_ ((u16)0x0040) -#define PHY_SPECIAL_RES_MASK_ ((u16)0x0FE1) -#define 
PHY_SPECIAL_SPD_ ((u16)0x001C) -#define PHY_SPECIAL_SPD_10HALF_ ((u16)0x0004) -#define PHY_SPECIAL_SPD_10FULL_ ((u16)0x0014) -#define PHY_SPECIAL_SPD_100HALF_ ((u16)0x0008) -#define PHY_SPECIAL_SPD_100FULL_ ((u16)0x0018) - -#define LAN911X_INTERNAL_PHY_ID (0x0007C000) - -/* Chip ID values */ -#define CHIP_9115 0x0115 -#define CHIP_9116 0x0116 -#define CHIP_9117 0x0117 -#define CHIP_9118 0x0118 -#define CHIP_9211 0x9211 -#define CHIP_9215 0x115A -#define CHIP_9217 0x117A -#define CHIP_9218 0x118A - -struct chip_id { - u16 id; - char *name; -}; - -static const struct chip_id chip_ids[] = { - { CHIP_9115, "LAN9115" }, - { CHIP_9116, "LAN9116" }, - { CHIP_9117, "LAN9117" }, - { CHIP_9118, "LAN9118" }, - { CHIP_9211, "LAN9211" }, - { CHIP_9215, "LAN9215" }, - { CHIP_9217, "LAN9217" }, - { CHIP_9218, "LAN9218" }, - { 0, NULL }, -}; - -#define IS_REV_A(x) ((x & 0xFFFF)==0) - -/* - * Macros to abstract register access according to the data bus - * capabilities. Please use those and not the in/out primitives. - */ -/* FIFO read/write macros */ -#define SMC_PUSH_DATA(lp, p, l) SMC_outsl( lp, TX_DATA_FIFO, p, (l) >> 2 ) -#define SMC_PULL_DATA(lp, p, l) SMC_insl ( lp, RX_DATA_FIFO, p, (l) >> 2 ) -#define SMC_SET_TX_FIFO(lp, x) SMC_outl( x, lp, TX_DATA_FIFO ) -#define SMC_GET_RX_FIFO(lp) SMC_inl( lp, RX_DATA_FIFO ) - - -/* I/O mapped register read/write macros */ -#define SMC_GET_TX_STS_FIFO(lp) SMC_inl( lp, TX_STATUS_FIFO ) -#define SMC_GET_RX_STS_FIFO(lp) SMC_inl( lp, RX_STATUS_FIFO ) -#define SMC_GET_RX_STS_FIFO_PEEK(lp) SMC_inl( lp, RX_STATUS_FIFO_PEEK ) -#define SMC_GET_PN(lp) (SMC_inl( lp, ID_REV ) >> 16) -#define SMC_GET_REV(lp) (SMC_inl( lp, ID_REV ) & 0xFFFF) -#define SMC_GET_IRQ_CFG(lp) SMC_inl( lp, INT_CFG ) -#define SMC_SET_IRQ_CFG(lp, x) SMC_outl( x, lp, INT_CFG ) -#define SMC_GET_INT(lp) SMC_inl( lp, INT_STS ) -#define SMC_ACK_INT(lp, x) SMC_outl( x, lp, INT_STS ) -#define SMC_GET_INT_EN(lp) SMC_inl( lp, INT_EN ) -#define SMC_SET_INT_EN(lp, x) SMC_outl( x, lp, INT_EN ) -#define SMC_GET_BYTE_TEST(lp) SMC_inl( lp, BYTE_TEST ) -#define SMC_SET_BYTE_TEST(lp, x) SMC_outl( x, lp, BYTE_TEST ) -#define SMC_GET_FIFO_INT(lp) SMC_inl( lp, FIFO_INT ) -#define SMC_SET_FIFO_INT(lp, x) SMC_outl( x, lp, FIFO_INT ) -#define SMC_SET_FIFO_TDA(lp, x) \ - do { \ - unsigned long __flags; \ - int __mask; \ - local_irq_save(__flags); \ - __mask = SMC_GET_FIFO_INT((lp)) & ~(0xFF<<24); \ - SMC_SET_FIFO_INT( (lp), __mask | (x)<<24 ); \ - local_irq_restore(__flags); \ - } while (0) -#define SMC_SET_FIFO_TSL(lp, x) \ - do { \ - unsigned long __flags; \ - int __mask; \ - local_irq_save(__flags); \ - __mask = SMC_GET_FIFO_INT((lp)) & ~(0xFF<<16); \ - SMC_SET_FIFO_INT( (lp), __mask | (((x) & 0xFF)<<16)); \ - local_irq_restore(__flags); \ - } while (0) -#define SMC_SET_FIFO_RSA(lp, x) \ - do { \ - unsigned long __flags; \ - int __mask; \ - local_irq_save(__flags); \ - __mask = SMC_GET_FIFO_INT((lp)) & ~(0xFF<<8); \ - SMC_SET_FIFO_INT( (lp), __mask | (((x) & 0xFF)<<8)); \ - local_irq_restore(__flags); \ - } while (0) -#define SMC_SET_FIFO_RSL(lp, x) \ - do { \ - unsigned long __flags; \ - int __mask; \ - local_irq_save(__flags); \ - __mask = SMC_GET_FIFO_INT((lp)) & ~0xFF; \ - SMC_SET_FIFO_INT( (lp),__mask | ((x) & 0xFF)); \ - local_irq_restore(__flags); \ - } while (0) -#define SMC_GET_RX_CFG(lp) SMC_inl( lp, RX_CFG ) -#define SMC_SET_RX_CFG(lp, x) SMC_outl( x, lp, RX_CFG ) -#define SMC_GET_TX_CFG(lp) SMC_inl( lp, TX_CFG ) -#define SMC_SET_TX_CFG(lp, x) SMC_outl( x, lp, TX_CFG ) -#define SMC_GET_HW_CFG(lp) SMC_inl( lp, 
HW_CFG ) -#define SMC_SET_HW_CFG(lp, x) SMC_outl( x, lp, HW_CFG ) -#define SMC_GET_RX_DP_CTRL(lp) SMC_inl( lp, RX_DP_CTRL ) -#define SMC_SET_RX_DP_CTRL(lp, x) SMC_outl( x, lp, RX_DP_CTRL ) -#define SMC_GET_PMT_CTRL(lp) SMC_inl( lp, PMT_CTRL ) -#define SMC_SET_PMT_CTRL(lp, x) SMC_outl( x, lp, PMT_CTRL ) -#define SMC_GET_GPIO_CFG(lp) SMC_inl( lp, GPIO_CFG ) -#define SMC_SET_GPIO_CFG(lp, x) SMC_outl( x, lp, GPIO_CFG ) -#define SMC_GET_RX_FIFO_INF(lp) SMC_inl( lp, RX_FIFO_INF ) -#define SMC_SET_RX_FIFO_INF(lp, x) SMC_outl( x, lp, RX_FIFO_INF ) -#define SMC_GET_TX_FIFO_INF(lp) SMC_inl( lp, TX_FIFO_INF ) -#define SMC_SET_TX_FIFO_INF(lp, x) SMC_outl( x, lp, TX_FIFO_INF ) -#define SMC_GET_GPT_CFG(lp) SMC_inl( lp, GPT_CFG ) -#define SMC_SET_GPT_CFG(lp, x) SMC_outl( x, lp, GPT_CFG ) -#define SMC_GET_RX_DROP(lp) SMC_inl( lp, RX_DROP ) -#define SMC_SET_RX_DROP(lp, x) SMC_outl( x, lp, RX_DROP ) -#define SMC_GET_MAC_CMD(lp) SMC_inl( lp, MAC_CSR_CMD ) -#define SMC_SET_MAC_CMD(lp, x) SMC_outl( x, lp, MAC_CSR_CMD ) -#define SMC_GET_MAC_DATA(lp) SMC_inl( lp, MAC_CSR_DATA ) -#define SMC_SET_MAC_DATA(lp, x) SMC_outl( x, lp, MAC_CSR_DATA ) -#define SMC_GET_AFC_CFG(lp) SMC_inl( lp, AFC_CFG ) -#define SMC_SET_AFC_CFG(lp, x) SMC_outl( x, lp, AFC_CFG ) -#define SMC_GET_E2P_CMD(lp) SMC_inl( lp, E2P_CMD ) -#define SMC_SET_E2P_CMD(lp, x) SMC_outl( x, lp, E2P_CMD ) -#define SMC_GET_E2P_DATA(lp) SMC_inl( lp, E2P_DATA ) -#define SMC_SET_E2P_DATA(lp, x) SMC_outl( x, lp, E2P_DATA ) - -/* MAC register read/write macros */ -#define SMC_GET_MAC_CSR(lp,a,v) \ - do { \ - while (SMC_GET_MAC_CMD((lp)) & MAC_CSR_CMD_CSR_BUSY_); \ - SMC_SET_MAC_CMD((lp),MAC_CSR_CMD_CSR_BUSY_ | \ - MAC_CSR_CMD_R_NOT_W_ | (a) ); \ - while (SMC_GET_MAC_CMD((lp)) & MAC_CSR_CMD_CSR_BUSY_); \ - v = SMC_GET_MAC_DATA((lp)); \ - } while (0) -#define SMC_SET_MAC_CSR(lp,a,v) \ - do { \ - while (SMC_GET_MAC_CMD((lp)) & MAC_CSR_CMD_CSR_BUSY_); \ - SMC_SET_MAC_DATA((lp), v); \ - SMC_SET_MAC_CMD((lp), MAC_CSR_CMD_CSR_BUSY_ | (a) ); \ - while (SMC_GET_MAC_CMD((lp)) & MAC_CSR_CMD_CSR_BUSY_); \ - } while (0) -#define SMC_GET_MAC_CR(lp, x) SMC_GET_MAC_CSR( (lp), MAC_CR, x ) -#define SMC_SET_MAC_CR(lp, x) SMC_SET_MAC_CSR( (lp), MAC_CR, x ) -#define SMC_GET_ADDRH(lp, x) SMC_GET_MAC_CSR( (lp), ADDRH, x ) -#define SMC_SET_ADDRH(lp, x) SMC_SET_MAC_CSR( (lp), ADDRH, x ) -#define SMC_GET_ADDRL(lp, x) SMC_GET_MAC_CSR( (lp), ADDRL, x ) -#define SMC_SET_ADDRL(lp, x) SMC_SET_MAC_CSR( (lp), ADDRL, x ) -#define SMC_GET_HASHH(lp, x) SMC_GET_MAC_CSR( (lp), HASHH, x ) -#define SMC_SET_HASHH(lp, x) SMC_SET_MAC_CSR( (lp), HASHH, x ) -#define SMC_GET_HASHL(lp, x) SMC_GET_MAC_CSR( (lp), HASHL, x ) -#define SMC_SET_HASHL(lp, x) SMC_SET_MAC_CSR( (lp), HASHL, x ) -#define SMC_GET_MII_ACC(lp, x) SMC_GET_MAC_CSR( (lp), MII_ACC, x ) -#define SMC_SET_MII_ACC(lp, x) SMC_SET_MAC_CSR( (lp), MII_ACC, x ) -#define SMC_GET_MII_DATA(lp, x) SMC_GET_MAC_CSR( (lp), MII_DATA, x ) -#define SMC_SET_MII_DATA(lp, x) SMC_SET_MAC_CSR( (lp), MII_DATA, x ) -#define SMC_GET_FLOW(lp, x) SMC_GET_MAC_CSR( (lp), FLOW, x ) -#define SMC_SET_FLOW(lp, x) SMC_SET_MAC_CSR( (lp), FLOW, x ) -#define SMC_GET_VLAN1(lp, x) SMC_GET_MAC_CSR( (lp), VLAN1, x ) -#define SMC_SET_VLAN1(lp, x) SMC_SET_MAC_CSR( (lp), VLAN1, x ) -#define SMC_GET_VLAN2(lp, x) SMC_GET_MAC_CSR( (lp), VLAN2, x ) -#define SMC_SET_VLAN2(lp, x) SMC_SET_MAC_CSR( (lp), VLAN2, x ) -#define SMC_SET_WUFF(lp, x) SMC_SET_MAC_CSR( (lp), WUFF, x ) -#define SMC_GET_WUCSR(lp, x) SMC_GET_MAC_CSR( (lp), WUCSR, x ) -#define SMC_SET_WUCSR(lp, x) SMC_SET_MAC_CSR( (lp), WUCSR, 
x ) - -/* PHY register read/write macros */ -#define SMC_GET_MII(lp,a,phy,v) \ - do { \ - u32 __v; \ - do { \ - SMC_GET_MII_ACC((lp), __v); \ - } while ( __v & MII_ACC_MII_BUSY_ ); \ - SMC_SET_MII_ACC( (lp), ((phy)<<11) | ((a)<<6) | \ - MII_ACC_MII_BUSY_); \ - do { \ - SMC_GET_MII_ACC( (lp), __v); \ - } while ( __v & MII_ACC_MII_BUSY_ ); \ - SMC_GET_MII_DATA((lp), v); \ - } while (0) -#define SMC_SET_MII(lp,a,phy,v) \ - do { \ - u32 __v; \ - do { \ - SMC_GET_MII_ACC((lp), __v); \ - } while ( __v & MII_ACC_MII_BUSY_ ); \ - SMC_SET_MII_DATA((lp), v); \ - SMC_SET_MII_ACC( (lp), ((phy)<<11) | ((a)<<6) | \ - MII_ACC_MII_BUSY_ | \ - MII_ACC_MII_WRITE_ ); \ - do { \ - SMC_GET_MII_ACC((lp), __v); \ - } while ( __v & MII_ACC_MII_BUSY_ ); \ - } while (0) -#define SMC_GET_PHY_BMCR(lp,phy,x) SMC_GET_MII( (lp), MII_BMCR, phy, x ) -#define SMC_SET_PHY_BMCR(lp,phy,x) SMC_SET_MII( (lp), MII_BMCR, phy, x ) -#define SMC_GET_PHY_BMSR(lp,phy,x) SMC_GET_MII( (lp), MII_BMSR, phy, x ) -#define SMC_GET_PHY_ID1(lp,phy,x) SMC_GET_MII( (lp), MII_PHYSID1, phy, x ) -#define SMC_GET_PHY_ID2(lp,phy,x) SMC_GET_MII( (lp), MII_PHYSID2, phy, x ) -#define SMC_GET_PHY_MII_ADV(lp,phy,x) SMC_GET_MII( (lp), MII_ADVERTISE, phy, x ) -#define SMC_SET_PHY_MII_ADV(lp,phy,x) SMC_SET_MII( (lp), MII_ADVERTISE, phy, x ) -#define SMC_GET_PHY_MII_LPA(lp,phy,x) SMC_GET_MII( (lp), MII_LPA, phy, x ) -#define SMC_SET_PHY_MII_LPA(lp,phy,x) SMC_SET_MII( (lp), MII_LPA, phy, x ) -#define SMC_GET_PHY_CTRL_STS(lp,phy,x) SMC_GET_MII( (lp), PHY_MODE_CTRL_STS, phy, x ) -#define SMC_SET_PHY_CTRL_STS(lp,phy,x) SMC_SET_MII( (lp), PHY_MODE_CTRL_STS, phy, x ) -#define SMC_GET_PHY_INT_SRC(lp,phy,x) SMC_GET_MII( (lp), PHY_INT_SRC, phy, x ) -#define SMC_SET_PHY_INT_SRC(lp,phy,x) SMC_SET_MII( (lp), PHY_INT_SRC, phy, x ) -#define SMC_GET_PHY_INT_MASK(lp,phy,x) SMC_GET_MII( (lp), PHY_INT_MASK, phy, x ) -#define SMC_SET_PHY_INT_MASK(lp,phy,x) SMC_SET_MII( (lp), PHY_INT_MASK, phy, x ) -#define SMC_GET_PHY_SPECIAL(lp,phy,x) SMC_GET_MII( (lp), PHY_SPECIAL, phy, x ) - - - -/* Misc read/write macros */ - -#ifndef SMC_GET_MAC_ADDR -#define SMC_GET_MAC_ADDR(lp, addr) \ - do { \ - unsigned int __v; \ - \ - SMC_GET_MAC_CSR((lp), ADDRL, __v); \ - addr[0] = __v; addr[1] = __v >> 8; \ - addr[2] = __v >> 16; addr[3] = __v >> 24; \ - SMC_GET_MAC_CSR((lp), ADDRH, __v); \ - addr[4] = __v; addr[5] = __v >> 8; \ - } while (0) -#endif - -#define SMC_SET_MAC_ADDR(lp, addr) \ - do { \ - SMC_SET_MAC_CSR((lp), ADDRL, \ - addr[0] | \ - (addr[1] << 8) | \ - (addr[2] << 16) | \ - (addr[3] << 24)); \ - SMC_SET_MAC_CSR((lp), ADDRH, addr[4]|(addr[5] << 8));\ - } while (0) - - -#define SMC_WRITE_EEPROM_CMD(lp, cmd, addr) \ - do { \ - while (SMC_GET_E2P_CMD((lp)) & MAC_CSR_CMD_CSR_BUSY_); \ - SMC_SET_MAC_CMD((lp), MAC_CSR_CMD_R_NOT_W_ | a ); \ - while (SMC_GET_MAC_CMD((lp)) & MAC_CSR_CMD_CSR_BUSY_); \ - } while (0) - -#endif /* _SMC911X_H_ */ diff --git a/include/linux/smc911x.h b/include/linux/smc911x.h deleted file mode 100644 index 8cace8189e74..000000000000 --- a/include/linux/smc911x.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SMC911X_H__ -#define __SMC911X_H__ - -#define SMC911X_USE_16BIT (1 << 0) -#define SMC911X_USE_32BIT (1 << 1) - -struct smc911x_platdata { - unsigned long flags; - unsigned long irq_flags; /* IRQF_... 
*/
-	int irq_polarity;
-};
-
-#endif /* __SMC911X_H__ */
-- 
cgit v1.2.3


From f392a1846489720fc2e063d1210633b6cf4ec5a4 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" 
Date: Mon, 17 Oct 2022 16:22:35 -0400
Subject: net: phylink: provide phylink_validate_mask_caps() helper

Provide a helper that restricts the link modes according to the
phylink capabilities.

Signed-off-by: Russell King (Oracle) 
[rebased on net-next/master and added documentation]
Signed-off-by: Sean Anderson 
Signed-off-by: David S. Miller 
---
 drivers/net/phy/phylink.c | 41 +++++++++++++++++++++++++++++------------
 include/linux/phylink.h   |  3 +++
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 75464df191ef..ef10f5a70e5a 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -562,31 +562,48 @@ unsigned long phylink_get_capabilities(phy_interface_t interface,
 EXPORT_SYMBOL_GPL(phylink_get_capabilities);
 
 /**
- * phylink_generic_validate() - generic validate() callback implementation
- * @config: a pointer to a &struct phylink_config.
+ * phylink_validate_mask_caps() - Restrict link modes based on caps
 * @supported: ethtool bitmask for supported link modes.
- * @state: a pointer to a &struct phylink_link_state.
+ * @state: an (optional) pointer to a &struct phylink_link_state.
+ * @mac_capabilities: bitmask of MAC capabilities
 *
- * Generic implementation of the validate() callback that MAC drivers can
- * use when they pass the range of supported interfaces and MAC capabilities.
- * This makes use of phylink_get_linkmodes().
+ * Calculate the supported link modes based on @mac_capabilities, and restrict
+ * @supported and @state based on that. Use this function if your capabilities
+ * aren't constant, such as if they vary depending on the interface.
 */
-void phylink_generic_validate(struct phylink_config *config,
-			      unsigned long *supported,
-			      struct phylink_link_state *state)
+void phylink_validate_mask_caps(unsigned long *supported,
+				struct phylink_link_state *state,
+				unsigned long mac_capabilities)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 	unsigned long caps;
 
 	phylink_set_port_modes(mask);
 	phylink_set(mask, Autoneg);
-	caps = phylink_get_capabilities(state->interface,
-					config->mac_capabilities,
+	caps = phylink_get_capabilities(state->interface, mac_capabilities,
 					state->rate_matching);
 	phylink_caps_to_linkmodes(mask, caps);
 
 	linkmode_and(supported, supported, mask);
-	linkmode_and(state->advertising, state->advertising, mask);
+	if (state)
+		linkmode_and(state->advertising, state->advertising, mask);
+}
+EXPORT_SYMBOL_GPL(phylink_validate_mask_caps);
+
+/**
+ * phylink_generic_validate() - generic validate() callback implementation
+ * @config: a pointer to a &struct phylink_config.
+ * @supported: ethtool bitmask for supported link modes.
+ * @state: a pointer to a &struct phylink_link_state.
+ *
+ * Generic implementation of the validate() callback that MAC drivers can
+ * use when they pass the range of supported interfaces and MAC capabilities.
+ */ +void phylink_generic_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) +{ + phylink_validate_mask_caps(supported, state, config->mac_capabilities); } EXPORT_SYMBOL_GPL(phylink_generic_validate); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 664dd409feb9..c29c3f174972 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -556,6 +556,9 @@ void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, unsigned long mac_capabilities, int rate_matching); +void phylink_validate_mask_caps(unsigned long *supported, + struct phylink_link_state *state, + unsigned long caps); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 51c352bdbcd23d7ce46b06c1e64c82754dc44044 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 18 Oct 2022 15:37:27 +0100 Subject: netlink: add support for formatted extack messages Include an 80-byte buffer in struct netlink_ext_ack that can be used for scnprintf()ed messages. This does mean that the resulting string can't be enumerated, translated etc. in the way NL_SET_ERR_MSG() was designed to allow. Signed-off-by: Edward Cree Reviewed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index d51e041d2242..d81bde5a5844 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -64,6 +64,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct @@ -75,6 +76,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length + * @_msg_buf: output buffer for formatted message strings - don't access + * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; @@ -84,13 +87,13 @@ struct netlink_ext_ack { u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; + char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. - * Currently string formatting is not supported (due - * to the lack of an output buffer.) + * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ @@ -102,9 +105,31 @@ struct netlink_ext_ack { __extack->_msg = __msg; \ } while (0) +/* We splice fmt with %s at each end even in the snprintf so that both calls + * can use the same string constant, avoiding its duplication in .rodata + */ +#define NL_SET_ERR_MSG_FMT(extack, fmt, args...)
do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (!__extack) \ + break; \ + if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ + "%s" fmt "%s", "", ##args, "") >= \ + NETLINK_MAX_FMTMSG_LEN) \ + net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ + ##args, "\n"); \ + \ + do_trace_netlink_extack(__extack->_msg_buf); \ + \ + __extack->_msg = __extack->_msg_buf; \ +} while (0) + #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) +#define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ + NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) + #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ -- cgit v1.2.3 From a5ef058dc4d9a3e60d1808a0700e18e0e37e408e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:51 +0200 Subject: net: introduce and use custom sockopt socket flag We will soon introduce custom setsockopt for UDP sockets, too. Instead of doing even more complex arbitrary checks inside sock_use_custom_sol_socket(), add a new socket flag and set it for the relevant socket types (currently only MPTCP). Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/net.h | 1 + net/mptcp/protocol.c | 4 ++++ net/socket.c | 8 +------- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..59350fd85823 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_CUSTOM_SOCKOPT 5 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2e16c897c229..e60d144bfb2d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2708,6 +2708,8 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ @@ -3684,6 +3686,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct mptcp_subflow_context *subflow; struct sock *newsk = newsock->sk; + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + lock_sock(newsk); /* PM/worker can now acquire the first subflow socket diff --git a/net/socket.c b/net/socket.c index 00da9ce3dba0..55c5d536e5f6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2199,13 +2199,7 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static bool sock_use_custom_sol_socket(const struct socket *sock) { - const struct sock *sk = sock->sk; - - /* Use sock->ops->setsockopt() for MPTCP */ - return IS_ENABLED(CONFIG_MPTCP) && - sk->sk_protocol == IPPROTO_MPTCP && - sk->sk_type == SOCK_STREAM && - (sk->sk_family == AF_INET || sk->sk_family == AF_INET6); + return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } /* -- cgit v1.2.3 From 8a3854c7b8e4532063b14bed34115079b7d0cb36 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:52 +0200 Subject: udp: track the forward memory release threshold in a hot cacheline When the receiver process and the BH run on different cores, udp_rmem_release() experiences a cache miss while accessing sk_rcvbuf, as the latter shares the same cacheline with sk_forward_alloc, written by the BH.
With this patch, UDP tracks the rcvbuf value and its updates via custom SOL_SOCKET socket options, and copies the forward memory threshold value used by udp_rmem_release() into a different cacheline, already accessed by the above function and uncontended. Since the UDP socket init operation has grown a bit, factor out the common code between v4 and v6 into a shared helper. Overall, the above gives a 10% peak throughput increase under UDP flood. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 3 +++ include/net/udp.h | 9 +++++++++ net/ipv4/udp.c | 18 +++++++++++++++--- net/ipv6/udp.c | 4 ++-- 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..5cdba00a904a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -87,6 +87,9 @@ struct udp_sock { /* This field is dirtied by udp_recvmsg() */ int forward_deficit; + + /* This field follows the rcvbuf value, and is touched by udp_recvmsg */ + int forward_threshold; }; #define UDP_MAX_SEGMENTS (1 << 6UL) diff --git a/include/net/udp.h b/include/net/udp.h index fee053bcd17c..de4b528522bb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -174,6 +174,15 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); +static inline void udp_lib_init_sock(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + skb_queue_head_init(&up->reader_queue); + up->forward_threshold = sk->sk_rcvbuf >> 2; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c83e5271030b..e77c8f0e9087 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1448,7 +1448,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && + if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { @@ -1622,7 +1622,7 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; return 0; } @@ -2671,6 +2671,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); + if (level == SOL_SOCKET) { + err = sk_setsockopt(sk, level, optname, optval, optlen); + + if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { + sockopt_lock_sock(sk); + /* paired with READ_ONCE in udp_rmem_release() */ + WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); + sockopt_release_sock(sk); + } + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2784,7 +2796,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2260406740d3..297f7cc06044 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -64,7 +64,7 @@ static void udpv6_destruct_sock(struct sock
*sk) int udpv6_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udpv6_destruct_sock; return 0; } @@ -1669,7 +1669,7 @@ void udpv6_destroy_sock(struct sock *sk) int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); -- cgit v1.2.3 From 4727bab4e9bbeafeff6acdfcb077a7a548cbde30 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 21 Oct 2022 10:58:22 +0800 Subject: net: skb: move skb_pp_recycle() to skbuff.c skb_pp_recycle() is only used by skb_free_head() in skbuff.c, so move it to skbuff.c. Signed-off-by: Yunsheng Lin Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ------- net/core/skbuff.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7be5bb4c94b6..59c9fd55699d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5050,12 +5050,5 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) } #endif -static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) -{ - if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) - return false; - return page_pool_return_skb_page(virt_to_page(data)); -} - #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1d9719e72f9d..9b3b19816d2d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -748,6 +748,13 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; -- cgit v1.2.3 From 398900498485fa9465386454681763d81efaad25 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 21 Oct 2022 16:09:59 +0100 Subject: net: sfp: provide a definition for the power level select bit Provide a named definition for the power level select bit in the extended status register, rather than using BIT(0) in the code. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp.c | 6 +++--- include/linux/sfp.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index af676e28ba6a..16bce0ea68d9 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -1837,13 +1837,13 @@ static int sfp_sm_mod_hpower(struct sfp *sfp, bool enable) * all bytes 0xff) at 0x51 but does not accept writes. In any case, * if the bit is already set, we're already in high power mode. 
*/ - if (!!(val & BIT(0)) == enable) + if (!!(val & SFP_EXT_STATUS_PWRLVL_SELECT) == enable) return 0; if (enable) - val |= BIT(0); + val |= SFP_EXT_STATUS_PWRLVL_SELECT; else - val &= ~BIT(0); + val &= ~SFP_EXT_STATUS_PWRLVL_SELECT; err = sfp_write(sfp, true, SFP_EXT_STATUS, &val, sizeof(val)); if (err != sizeof(val)) { diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d1f343853b6c..01ae9f1dd2ad 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -489,6 +489,8 @@ enum { SFP_WARN1_RXPWR_LOW = BIT(6), SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), + SFP_VSL = 0x78, SFP_PAGE = 0x7f, }; -- cgit v1.2.3 From 73feb8d5fa3b755bb51077c0aabfb6aa556fd498 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Oct 2022 15:41:41 +0200 Subject: kallsyms: Make module_kallsyms_on_each_symbol generally available Make module_kallsyms_on_each_symbol generally available, so it can be used outside of the CONFIG_LIVEPATCH option in the following changes. Rather than adding another ifdef option, let's make the function generally available (when the CONFIG_KALLSYMS and CONFIG_MODULES options are defined). Cc: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20221025134148.3300700-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 9 +++++++++ kernel/module/kallsyms.c | 2 -- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ec61fb53979a..35876e89eb93 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -879,8 +879,17 @@ static inline bool module_sig_ok(struct module *module) } #endif /* CONFIG_MODULE_SIG */ +#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); +#else +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index f5c5c9175333..4523f99b0358 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -494,7 +494,6 @@ unsigned long module_kallsyms_lookup_name(const char *name) return ret; } -#ifdef CONFIG_LIVEPATCH int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) @@ -531,4 +530,3 @@ out: mutex_unlock(&module_mutex); return ret; } -#endif /* CONFIG_LIVEPATCH */ -- cgit v1.2.3 From b5f0de6df6dce8d641ef58ef7012f3304dffb9a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:56:03 -0700 Subject: net: dev: Convert sa_data to flexible array in struct sockaddr One of the worst offenders of "fake flexible arrays" is struct sockaddr, as it is the classic example of why GCC and Clang have been traditionally forced to treat all trailing arrays as fake flexible arrays: in the distant misty past, sa_data became too small, and code started just treating it as a flexible array, even though it was fixed-size. The special case by the compiler is specifically that sizeof(sa->sa_data) and FORTIFY_SOURCE (which uses __builtin_object_size(sa->sa_data, 1)) do not agree (14 and -1 respectively), which makes FORTIFY_SOURCE treat it as a flexible array.
However, the coming -fstrict-flex-arrays compiler flag will remove these special cases so that FORTIFY_SOURCE can gain coverage over all the trailing arrays in the kernel that are _not_ supposed to be treated as a flexible array. To deal with this change, convert sa_data to a true flexible array. To keep the structure size the same, move sa_data into a union with a newly introduced sa_data_min with the original size. The result is that FORTIFY_SOURCE can continue to have no idea how large sa_data may actually be, but anything using sizeof(sa->sa_data) must switch to sizeof(sa->sa_data_min). Cc: Jens Axboe Cc: Pavel Begunkov Cc: David Ahern Cc: Dylan Yudaken Cc: Yajun Deng Cc: Petr Machata Cc: Hangbin Liu Cc: Leon Romanovsky Cc: syzbot Cc: Willem de Bruijn Cc: Pablo Neira Ayuso Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018095503.never.671-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 5 ++++- net/core/dev.c | 2 +- net/core/dev_ioctl.c | 2 +- net/packet/af_packet.c | 10 +++++----- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index de3701a2a212..13c3a237b9c9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -33,7 +33,10 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - char sa_data[14]; /* 14 bytes of protocol address */ + union { + char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ + DECLARE_FLEX_ARRAY(char, sa_data); + }; }; struct linger { diff --git a/net/core/dev.c b/net/core/dev.c index 3be256051e99..fff62068a53d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8822,7 +8822,7 @@ EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - size_t size = sizeof(sa->sa_data); + size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7674bb9f3076..5cdbfbf9a7dc 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -342,7 +342,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data), + min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 6ce8dd19f33c..8c5b3da0c29f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3277,7 +3277,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data) + 1]; + char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality @@ -3288,8 +3288,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. 
*/ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); - name[sizeof(uaddr->sa_data)] = 0; + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); + name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } @@ -3561,11 +3561,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); -- cgit v1.2.3 From 271de525e1d7f564e88a9d212c50998b49a54476 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:16 -0700 Subject: bpf: Remove prog->active check for bpf_lsm and bpf_iter The commit 64696c40d03c ("bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline") removed the prog->active check for struct_ops progs. The bpf_lsm and bpf_iter programs also use trampolines. Like struct_ops, the bpf_lsm and bpf_iter progs have fixed hooks to attach to. The kernel does not call the same hook in a recursive way. This patch also removes the prog->active check for bpf_lsm and bpf_iter. A later patch has a test to reproduce the recursion issue for a sleepable bpf_lsm program. This patch appends the '_recur' naming to the existing enter and exit functions that track the prog->active counter. New __bpf_prog_{enter,exit}[_sleepable] functions are added to skip the prog->active tracking. The '_struct_ops' version is also removed. It also moves the decision on picking the enter and exit functions to the new bpf_trampoline_{enter,exit}(). It returns the '_recur' ones for all tracing progs to use. For bpf_lsm, bpf_iter, struct_ops (no prog->active tracking after 64696c40d03c), and bpf_lsm_cgroup (no prog->active tracking after 69fd337a975c7), it will return the functions that don't track the prog->active.
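For illustration, a rough sketch of how an architecture JIT is expected to consume the new API after this change; struct jit_ctx and emit_call_addr() are hypothetical placeholders for the arch-specific emitter state, and the real arm64 and x86 conversions are in the diff below:

/* Sketch only: after this patch a JIT no longer picks an enter/exit
 * variant itself; one lookup per prog resolves the right handler.
 */
static int emit_bpf_prog_invocation(struct jit_ctx *ctx, struct bpf_prog *p)
{
	/* the recur/sleepable/lsm_cgroup decision now lives in
	 * kernel/bpf/trampoline.c behind these two lookups
	 */
	u64 enter_addr = (u64)bpf_trampoline_enter(p);
	u64 exit_addr = (u64)bpf_trampoline_exit(p);

	emit_call_addr(ctx, enter_addr);	/* calls __bpf_prog_enter*() */
	/* ... emit the call into p->bpf_func and retval handling ... */
	emit_call_addr(ctx, exit_addr);		/* calls __bpf_prog_exit*() */
	return 0;
}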
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 9 ++--- arch/x86/net/bpf_jit_comp.c | 19 ++-------- include/linux/bpf.h | 24 ++++++------- include/linux/bpf_verifier.h | 15 +++++++- kernel/bpf/syscall.c | 5 +-- kernel/bpf/trampoline.c | 80 ++++++++++++++++++++++++++++++++++++------- 6 files changed, 98 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 30f76178608b..62f805f427b7 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1649,13 +1649,8 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, struct bpf_prog *p = l->link.prog; int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - if (p->aux->sleepable) { - enter_prog = (u64)__bpf_prog_enter_sleepable; - exit_prog = (u64)__bpf_prog_exit_sleepable; - } else { - enter_prog = (u64)__bpf_prog_enter; - exit_prog = (u64)__bpf_prog_exit; - } + enter_prog = (u64)bpf_trampoline_enter(p); + exit_prog = (u64)bpf_trampoline_exit(p); if (l->cookie == 0) { /* if cookie is zero, one instruction is enough to store it */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d7dd8e0db8da..36ffe67ad6e5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1894,10 +1894,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { - void (*exit)(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; - u64 (*enter)(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1916,23 +1912,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); - if (p->aux->sleepable) { - enter = __bpf_prog_enter_sleepable; - exit = __bpf_prog_exit_sleepable; - } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { - enter = __bpf_prog_enter_struct_ops; - exit = __bpf_prog_exit_struct_ops; - } else if (p->expected_attach_type == BPF_LSM_CGROUP) { - enter = __bpf_prog_enter_lsm_cgroup; - exit = __bpf_prog_exit_lsm_cgroup; - } - /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, enter, prog)) + if (emit_call(&prog, bpf_trampoline_enter(p), prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1977,7 +1962,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, exit, prog)) + if (emit_call(&prog, bpf_trampoline_exit(p), prog)) return -EINVAL; *pprog = prog; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..1279e699dc98 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -854,22 +854,18 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call); -/* these 
two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); +typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); struct bpf_ksym { unsigned long start; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e1e6965f407..1a32baa78ce2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -642,10 +642,23 @@ static inline u32 type_flag(u32 type) } /* only use after check_attach_btf_id() */ -static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_EXT ? 
prog->aux->dst_prog->type : prog->type; } +static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) +{ + switch (resolve_prog_type(prog)) { + case BPF_PROG_TYPE_TRACING: + return prog->expected_attach_type != BPF_TRACE_ITER; + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_LSM: + return false; + default: + return true; + } +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7b373a5e861f..a0b4266196a8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5133,13 +5133,14 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) run_ctx.bpf_cookie = 0; run_ctx.saved_run_ctx = NULL; - if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { + if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); - __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); + __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, + &run_ctx); bpf_prog_put(prog); return 0; #endif diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index bf0906e1e2b9..d6395215b849 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -864,7 +864,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); @@ -901,7 +901,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -912,8 +913,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { /* Runtime stats are exported via actual BPF_LSM_CGROUP @@ -927,8 +928,8 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, return NO_START_TIME; } -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -937,7 +938,8 @@ void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); @@ -953,8 +955,8 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + 
struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -964,8 +966,30 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, rcu_read_unlock_trace(); } -u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) +{ + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock_trace(); +} + +static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); @@ -976,8 +1000,8 @@ u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -997,6 +1021,36 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) percpu_ref_put(&tr->pcref); } +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) +{ + bool sleepable = prog->aux->sleepable; + + if (bpf_prog_check_recur(prog)) + return sleepable ? __bpf_prog_enter_sleepable_recur : + __bpf_prog_enter_recur; + + if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_CGROUP) + return __bpf_prog_enter_lsm_cgroup; + + return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; +} + +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) +{ + bool sleepable = prog->aux->sleepable; + + if (bpf_prog_check_recur(prog)) + return sleepable ? __bpf_prog_exit_sleepable_recur : + __bpf_prog_exit_recur; + + if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_CGROUP) + return __bpf_prog_exit_lsm_cgroup; + + return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; +} + int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, -- cgit v1.2.3 From 0593dd34e53489557569d5e6d27371b49aa9b41f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:17 -0700 Subject: bpf: Append _recur naming to the bpf_task_storage helper proto This patch adds the "_recur" naming to the bpf_task_storage_{get,delete} proto. In a later patch, they will only be used by the tracing programs that require deadlock detection, because a tracing prog may use bpf_task_storage_{get,delete} recursively and cause a deadlock. Another following patch will add a different helper proto for the non-tracing programs because they do not need the deadlock prevention. This patch does this rename to prepare for these future proto additions.
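As a hedged illustration of the recursion the '_recur' variants guard against, consider a tracing program that fires inside the task-storage code path and calls the helper again; the attach point and map below are hypothetical, chosen only to show the shape of the problem:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, long);
} task_map SEC(".maps");

SEC("fentry/bpf_local_storage_lookup")	/* hypothetical attach point */
int BPF_PROG(on_storage_lookup)
{
	struct task_struct *task = bpf_get_current_task_btf();

	/* Re-enters the task-storage code that triggered this prog; the
	 * _recur protos detect this via the per-cpu busy counter and
	 * fail the call instead of deadlocking on the storage lock.
	 */
	bpf_task_storage_get(&task_map, task, 0,
			     BPF_LOCAL_STORAGE_GET_F_CREATE);
	return 0;
}

char _license[] SEC("license") = "GPL";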
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- kernel/bpf/bpf_task_storage.c | 12 ++++++------ kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1279e699dc98..b04fe3f4342e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2519,8 +2519,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 6f290623347e..bce50ae03f42 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -228,7 +228,7 @@ out: } /* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, +BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -260,7 +260,7 @@ unlock: (unsigned long)sdata->data; } -BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, +BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, task) { int ret; @@ -322,8 +322,8 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -const struct bpf_func_proto bpf_task_storage_get_proto = { - .func = bpf_task_storage_get, +const struct bpf_func_proto bpf_task_storage_get_recur_proto = { + .func = bpf_task_storage_get_recur, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, @@ -333,8 +333,8 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; -const struct bpf_func_proto bpf_task_storage_delete_proto = { - .func = bpf_task_storage_delete, +const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { + .func = bpf_task_storage_delete_recur, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2460e38f75ff..83b9b9afe235 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1488,9 +1488,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; + return &bpf_task_storage_get_recur_proto; case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; + return &bpf_task_storage_delete_recur_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_snprintf: -- cgit v1.2.3 From 4279adb094a17132423f1271c3d11b593fc2327e Mon Sep 17 00:00:00 2001 From: 
Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:20 -0700 Subject: bpf: Add new bpf_task_storage_get proto with no deadlock detection The bpf_lsm and bpf_iter programs do not recur in a way that would cause a deadlock. The situation is similar to the bpf_pid_task_storage_lookup_elem() which is called from the syscall map_lookup_elem. It does not need deadlock detection. Otherwise, it would cause unnecessary failures when calling the bpf_task_storage_get() helper. This patch adds a bpf_task_storage_get proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter programs. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/bpf_task_storage.c | 28 ++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 5 ++++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b04fe3f4342e..ef3f98afa45d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2520,6 +2520,7 @@ extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index bc52bc8b59f7..c3a841be438f 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -269,6 +269,23 @@ BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct return (unsigned long)data; } +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) +{ + void *data; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + bpf_task_storage_lock(); + data = __bpf_task_storage_get(map, task, value, flags, + gfp_flags, true); + bpf_task_storage_unlock(); + return (unsigned long)data; +} + BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, task) { @@ -342,6 +359,17 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = { .arg4_type = ARG_ANYTHING, }; +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { .func = bpf_task_storage_delete_recur, .gpl_only = false, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 83b9b9afe235..e9759b0f7199 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1488,7 +1489,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_this_cpu_ptr: return
&bpf_this_cpu_ptr_proto; case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_recur_proto; + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; + return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_recur_proto; case BPF_FUNC_for_each_map_elem: -- cgit v1.2.3 From 8a7dac37f27a3dfbd814bf29a73d6417db2c81d9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:22 -0700 Subject: bpf: Add new bpf_task_storage_delete proto with no deadlock detection The bpf_lsm and bpf_iter programs do not recur in a way that would cause a deadlock. The situation is similar to the bpf_pid_task_storage_delete_elem() which is called from the syscall map_delete_elem. It does not need deadlock detection. Otherwise, it would cause unnecessary failures when calling the bpf_task_storage_delete() helper. This patch adds a bpf_task_storage_delete proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter programs. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/bpf_task_storage.c | 28 ++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 4 +++- 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef3f98afa45d..a5dbac8f5aba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2522,6 +2522,7 @@ extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index f3f79b618a68..ba3fe72d1fa5 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -311,6 +311,25 @@ BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_str return ret; } +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!task) + return -EINVAL; + + bpf_task_storage_lock(); + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section.
+ */ + ret = task_storage_delete(task, map, true); + bpf_task_storage_unlock(); + return ret; +} + static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; @@ -382,3 +401,12 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { .arg2_type = ARG_PTR_TO_BTF_ID, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e9759b0f7199..eed1bd952c3a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1493,7 +1493,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_recur_proto; + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; + return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_snprintf: -- cgit v1.2.3 From 5e67b8ef125bb6e83bf0f0442ad7ffc09e7956f9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:40 -0700 Subject: bpf: Make struct cgroup btf id global Make struct cgroup btf id global so a later patch can reuse the same btf id. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042840.672602-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 1 + kernel/bpf/cgroup_iter.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 2aea877d644f..c9744efd202f 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -265,5 +265,6 @@ MAX_BTF_TRACING_TYPE, }; extern u32 btf_tracing_ids[]; +extern u32 bpf_cgroup_btf_id[]; #endif diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 0d200a993489..c6ffc706d583 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -157,7 +157,7 @@ static const struct seq_operations cgroup_iter_seq_ops = { .show = cgroup_iter_seq_show, }; -BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup) static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) { -- cgit v1.2.3 From c83597fa5dc6b322e9bdf929e5f4136a3f4aa4db Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:45 -0700 Subject: bpf: Refactor some inode/task/sk storage functions for reuse Refactor the code so that the inode/task/sk storage implementations can maximally share the same code. I also added some comments in the new function bpf_local_storage_unlink_nolock() to make the code easier to understand. There is no functionality change.
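As a sketch of the pattern this refactor converges on, a storage flavor's map alloc/free callbacks reduce to thin wrappers over the shared helpers; the "example" flavor below is hypothetical, and the real inode/task/sk conversions are in the diff that follows:

/* Hypothetical storage flavor, for illustration only. */
DEFINE_BPF_STORAGE_CACHE(example_cache);

static struct bpf_map *example_storage_map_alloc(union bpf_attr *attr)
{
	/* cache_idx selection now happens inside the shared helper */
	return bpf_local_storage_map_alloc(attr, &example_cache);
}

static void example_storage_map_free(struct bpf_map *map)
{
	/* pass a busy counter like task storage does, or NULL */
	bpf_local_storage_map_free(map, &example_cache, NULL);
}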
Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042845.672944-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 17 ++-- kernel/bpf/bpf_inode_storage.c | 38 +------- kernel/bpf/bpf_local_storage.c | 190 ++++++++++++++++++++++++-------------- kernel/bpf/bpf_task_storage.c | 38 +------- net/core/bpf_sk_storage.c | 35 +------ 5 files changed, 137 insertions(+), 181 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 7ea18d4da84b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -116,21 +116,22 @@ static struct bpf_local_storage_cache name = { \ .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx); - /* Helper functions for bpf_local_storage */ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, @@ -141,10 +142,6 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem, bool use_trace_rcu); - void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 5f7683b19199..6a1d4d22816a 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, void bpf_inode_storage_free(struct inode *inode) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_inode_storage = false; struct bpf_storage_blob *bsb; - struct hlist_node *n; bsb = bpf_inode(inode); if (!bsb) @@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ raw_spin_lock_bh(&local_storage->lock); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. 
- */ - bpf_selem_unlink_map(selem); - free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); - /* free_inoode_storage should always be true as long as - * local_storage->list was non-empty. - */ if (free_inode_storage) kfree_rcu(local_storage, rcu); } @@ -226,23 +205,12 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &inode_cache); } static void inode_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, NULL); + bpf_local_storage_map_free(map, &inode_cache, NULL); } BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 781d14167140..93d9b1b17bc8 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -113,9 +113,9 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool use_trace_rcu) +static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -501,7 +501,7 @@ unlock_err: return ERR_PTR(err); } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; @@ -525,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) return res; } -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx) +static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, - int __percpu *busy_counter) -{ - struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map_bucket *b; - unsigned int i; - - /* Note that this map might be concurrently cloned from - * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone - * RCU read section to finish before proceeding. New RCU - * read sections should be prevented via bpf_map_inc_not_zero. - */ - synchronize_rcu(); - - /* bpf prog and the userspace can no longer access this map - * now. No new selem (of this map) can be added - * to the owner->storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or when the storage is freed e.g. - * by bpf_sk_storage_free() during __sk_destruct(). 
- */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - if (busy_counter) { - migrate_disable(); - this_cpu_inc(*busy_counter); - } - bpf_selem_unlink(selem, false); - if (busy_counter) { - this_cpu_dec(*busy_counter); - migrate_enable(); - } - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* While freeing the storage we may still need to access the map. - * - * e.g. when bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exit immediately. - * - * However, while freeing the storage one still needs to access the - * smap->elem_size to do the uncharging in - * bpf_selem_unlink_storage_nolock(). - * - * Hence, wait another rcu grace period for the storage to be freed. - */ - synchronize_rcu(); - - kvfree(smap->buckets); - bpf_map_area_free(smap); -} - int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || @@ -614,7 +552,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; unsigned int i; @@ -664,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } + +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +{ + struct bpf_local_storage_elem *selem; + bool free_storage = false; + struct hlist_node *n; + + /* Neither the bpf_prog nor the bpf_map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added to or deleted from the + * local_storage->list by the bpf_prog or by the bpf_map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + /* If local_storage list has only one element, the + * bpf_selem_unlink_storage_nolock() will return true. + * Otherwise, it will return false. The current loop iteration + * intends to remove all local storage. So the last iteration + * of the loop will set the free_cgroup_storage to true. + */ + free_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false, false); + } + + return free_storage; +} + +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache) +{ + struct bpf_local_storage_map *smap; + + smap = __bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(cache); + return &smap->map; +} + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, + int __percpu *busy_counter) +{ + struct bpf_local_storage_map_bucket *b; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + unsigned int i; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(cache, smap->cache_idx); + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. 
Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + this_cpu_inc(*busy_counter); + } + bpf_selem_unlink(selem, false); + if (busy_counter) { + this_cpu_dec(*busy_counter); + migrate_enable(); + } + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + bpf_map_area_free(smap); +} diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ba3fe72d1fa5..8e832db8151a 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, void bpf_task_storage_free(struct task_struct *task) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct hlist_node *n; unsigned long flags; rcu_read_lock(); @@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ bpf_task_storage_lock(); raw_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_task_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); rcu_read_unlock(); - /* free_task_storage should always be true as long as - * local_storage->list was non-empty. 
- */ if (free_task_storage) kfree_rcu(local_storage, rcu); } @@ -337,23 +316,12 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &task_cache); } static void task_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 94374d529ea4..49884e7de080 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -48,10 +48,8 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *sk_storage; bool free_sk_storage = false; - struct hlist_node *n; rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); @@ -60,24 +58,8 @@ void bpf_sk_storage_free(struct sock *sk) return; } - /* Netiher the bpf_prog nor the bpf-map's syscall - * could be modifying the sk_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * sk_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the sk_storage->list and - * the map's bucket->list. - */ raw_spin_lock_bh(&sk_storage->lock); - hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { - /* Always unlink from map before unlinking from - * sk_storage. - */ - bpf_selem_unlink_map(selem); - free_sk_storage = bpf_selem_unlink_storage_nolock( - sk_storage, selem, true, false); - } + free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage); raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); @@ -87,23 +69,12 @@ void bpf_sk_storage_free(struct sock *sk) static void bpf_sk_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, NULL); + bpf_local_storage_map_free(map, &sk_cache, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &sk_cache); } static int notsupp_get_next_key(struct bpf_map *map, void *key, -- cgit v1.2.3 From c4bcfb38a95edb1021a53f2d0356a78120ecfbe4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:50 -0700 Subject: bpf: Implement cgroup storage available to non-cgroup-attached bpf progs Similar to sk/inode/task storage, implement similar cgroup local storage. There already exists a local storage implementation for cgroup-attached bpf programs. 
See map type BPF_MAP_TYPE_CGROUP_STORAGE and helper bpf_get_local_storage(). But there are use cases where non-cgroup-attached bpf progs want to access cgroup local storage data. For example, a tc egress prog has access to both sk and cgroup. It is possible to use sk local storage to emulate cgroup local storage by storing data in the socket, but this is wasteful since there could be many sockets belonging to a particular cgroup. Alternatively, a separate map can be created with the cgroup id as the key, but this introduces additional overhead to manipulate the new map. A cgroup local storage, similar to the existing sk/inode/task storage, should help for this use case. The life-cycle of the storage is managed with the life-cycle of the cgroup struct, i.e. the storage is destroyed along with the owning cgroup with a call to bpf_cgrp_storage_free() when the cgroup itself is deleted. The userspace map operations can be done by using a cgroup fd as a key passed to the lookup, update and delete operations. Typically, the following code is used to get the current cgroup: struct task_struct *task = bpf_get_current_task_btf(); ... task->cgroups->dfl_cgrp ... and in the task_struct definition: struct task_struct { .... struct css_set __rcu *cgroups; .... } With a sleepable program, accessing task->cgroups is not protected by rcu_read_lock. So the current implementation only supports non-sleepable programs; supporting sleepable programs will be the next step, together with adding rcu_read_lock protection for rcu-tagged structures. Since the map name BPF_MAP_TYPE_CGROUP_STORAGE has been used for the old cgroup local storage support, the new map name BPF_MAP_TYPE_CGRP_STORAGE is used for cgroup storage available to non-cgroup-attached bpf programs. The old cgroup storage supports the bpf_get_local_storage() helper to get the cgroup data; the new cgroup storage helper bpf_cgrp_storage_get() provides similar functionality. While the old cgroup storage pre-allocates storage memory, the new mechanism can also pre-allocate with a user space bpf_map_update_elem() call to avoid potential run-time memory allocation failures. Therefore, the new cgroup storage provides all the functionality of the old one. So in uapi bpf.h, the old BPF_MAP_TYPE_CGROUP_STORAGE is aliased to BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED to indicate that the old cgroup storage can be deprecated, since the new one provides the same functionality.
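To illustrate the intended usage, a minimal tracing program using the new map type could look like the sketch below. This is only an illustration, not part of the patch: the map name, section and program name are made up, and it assumes the usual libbpf conventions (vmlinux.h, bpf_helpers.h, SEC()) rather than anything introduced here.

  /* Hypothetical sketch: count events per cgroup of the current task. */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct {
          __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
          __uint(map_flags, BPF_F_NO_PREALLOC);
          __type(key, int);
          __type(value, long);
  } cgrp_events SEC(".maps");

  SEC("tp_btf/sched_switch")
  int BPF_PROG(count_switch)
  {
          struct task_struct *task = bpf_get_current_task_btf();
          long *cnt;

          /* Create the storage for this cgroup on first use. */
          cnt = bpf_cgrp_storage_get(&cgrp_events, task->cgroups->dfl_cgrp,
                                     NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
          if (cnt)
                  __sync_fetch_and_add(cnt, 1);
          return 0;
  }

  char _license[] SEC("license") = "GPL";

From user space, the same storage can then be read with bpf_map_lookup_elem() using a cgroup fd as the key, as described above.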
Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042850.673791-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/linux/cgroup-defs.h | 4 + include/uapi/linux/bpf.h | 50 ++++++++- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_cgrp_storage.c | 247 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 6 + kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 13 ++- kernel/cgroup/cgroup.c | 1 + kernel/trace/bpf_trace.c | 4 + scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 50 ++++++++- 13 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 kernel/bpf/bpf_cgrp_storage.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5dbac8f5aba..9fd68b0b3e9c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2041,6 +2041,7 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, @@ -2295,6 +2296,10 @@ static inline bool has_current_bpf_ctx(void) static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } + +static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2535,6 +2540,8 @@ extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2c6a4f2562a7..d4ee3ccd3753 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -86,6 +86,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_CGRP_STORAGE, cgrp_storage_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8f481d1b159a..c466fdc3a32a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -504,6 +504,10 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_local_storage __rcu *bpf_cgrp_storage; +#endif + /* All ancestors including self */ struct cgroup *ancestors[]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17f61338f8f8..94659f6b3395 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -922,7 +922,14 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, - BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. 
The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, @@ -935,6 +942,7 @@ enum bpf_map_type { BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, }; /* Note that tracing related programs such as @@ -5435,6 +5443,44 @@ union bpf_attr { * **-E2BIG** if user-space has tried to publish a sample which is * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) 
\ FN(unspec, 0, ##ctx) \ @@ -5647,6 +5693,8 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ FN(ktime_get_tai_ns, 208, ##ctx) \ FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 341c94f208f4..3a12e6b400a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -25,7 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif ifeq ($(CONFIG_CGROUPS),y) -obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c new file mode 100644 index 000000000000..309403800f82 --- /dev/null +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + */ + +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(cgroup_cache); + +static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); + +static void bpf_cgrp_storage_lock(void) +{ + migrate_disable(); + this_cpu_inc(bpf_cgrp_storage_busy); +} + +static void bpf_cgrp_storage_unlock(void) +{ + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); +} + +static bool bpf_cgrp_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); + return false; + } + return true; +} + +static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) +{ + struct cgroup *cg = owner; + + return &cg->bpf_cgrp_storage; +} + +void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ + struct bpf_local_storage *local_storage; + bool free_cgroup_storage = false; + unsigned long flags; + + rcu_read_lock(); + local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + bpf_cgrp_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); + free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_cgrp_storage_unlock(); + rcu_read_unlock(); + + if (free_cgroup_storage) + kfree_rcu(local_storage, rcu); +} + +static struct bpf_local_storage_data * +cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_local_storage *cgroup_storage; + struct bpf_local_storage_map *smap; + + cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, + bpf_rcu_lock_held()); + if (!cgroup_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); +} + +static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return ERR_CAST(cgroup); + + bpf_cgrp_storage_lock(); + sdata = cgroup_storage_lookup(cgroup, map, true); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return sdata ? 
sdata->data : NULL; +} + +static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, map_flags, GFP_ATOMIC); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return PTR_ERR_OR_ZERO(sdata); +} + +static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = cgroup_storage_lookup(cgroup, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata), true); + return 0; +} + +static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct cgroup *cgroup; + int err, fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + err = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return err; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + return bpf_local_storage_map_alloc(attr, &cgroup_cache); +} + +static void cgroup_storage_map_free(struct bpf_map *map) +{ + bpf_local_storage_map_free(map, &cgroup_cache, NULL); +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + struct bpf_local_storage_data *sdata; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + if (!cgroup) + return (unsigned long)NULL; + + if (!bpf_cgrp_storage_trylock()) + return (unsigned long)NULL; + + sdata = cgroup_storage_lookup(cgroup, map, true); + if (sdata) + goto unlock; + + /* only allocate new storage, when the cgroup is refcounted */ + if (!percpu_ref_is_dying(&cgroup->self.refcnt) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, BPF_NOEXIST, gfp_flags); + +unlock: + bpf_cgrp_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; +} + +BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!cgroup) + return -EINVAL; + + if (!bpf_cgrp_storage_trylock()) + return -EBUSY; + + ret = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + return ret; +} + +BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) +const struct bpf_map_ops cgrp_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_cgrp_storage_lookup_elem, + .map_update_elem = bpf_cgrp_storage_update_elem, + .map_delete_elem = bpf_cgrp_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_id = &cgroup_storage_map_btf_ids[0], + .map_owner_storage_ptr = cgroup_storage_ptr, +}; + +const struct bpf_func_proto bpf_cgrp_storage_get_proto = { + .func = bpf_cgrp_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { + .func = bpf_cgrp_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a6b04faed282..124fd199ce5c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1663,6 +1663,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; +#endif default: break; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a0b4266196a8..11df90962101 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1016,7 +1016,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ddc1452cf023..8f849a763b79 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6350,6 +6350,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_task_storage_delete) goto error; break; + case BPF_MAP_TYPE_CGRP_STORAGE: + if (func_id != BPF_FUNC_cgrp_storage_get && + func_id != BPF_FUNC_cgrp_storage_delete) + goto error; + break; case BPF_MAP_TYPE_BLOOM_FILTER: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_push_elem) @@ -6462,6 +6467,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) goto error; break; + case BPF_FUNC_cgrp_storage_get: + case BPF_FUNC_cgrp_storage_delete: + if 
(map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) + goto error; + break; default: break; } @@ -14139,7 +14149,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_task_storage_get || insn->imm == BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get) { + insn->imm == BPF_FUNC_inode_storage_get || + insn->imm == BPF_FUNC_cgrp_storage_get) { if (env->prog->aux->sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); else diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 764bdd5fd8d1..12001928511b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5245,6 +5245,7 @@ static void css_free_rwork_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); + bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index eed1bd952c3a..ce0228c72a93 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1455,6 +1455,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c0e6690be82a..fdb0aff8cb5a 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -685,6 +685,7 @@ class PrinterHelpers(Printer): 'struct udp6_sock', 'struct unix_sock', 'struct task_struct', + 'struct cgroup', 'struct __sk_buff', 'struct sk_msg_md', @@ -742,6 +743,7 @@ class PrinterHelpers(Printer): 'struct udp6_sock', 'struct unix_sock', 'struct task_struct', + 'struct cgroup', 'struct path', 'struct btf_ptr', 'struct inode', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 17f61338f8f8..94659f6b3395 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -922,7 +922,14 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, - BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, @@ -935,6 +942,7 @@ enum bpf_map_type { BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, }; /* Note that tracing related programs such as @@ -5435,6 +5443,44 @@ union bpf_attr { * **-E2BIG** if user-space has tried to publish a sample which is * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. 
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5647,6 +5693,8 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ FN(ktime_get_tai_ns, 208, ##ctx) \ FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't -- cgit v1.2.3 From e753df8fbca592d36f539ed950fcdf412166c549 Mon Sep 17 00:00:00 2001 From: Michal Jaron Date: Tue, 25 Oct 2022 09:12:52 -0700 Subject: ice: Add support Flex RXD Add new VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC flag, opcode VIRTCHNL_OP_GET_SUPPORTED_RXDIDS and add member rxdid in struct virtchnl_rxq_info to support AVF Flex RXD extension. Add support to allow VF to query flexible descriptor RXDIDs supported by DDP package and configure Rx queues with selected RXDID for IAVF. Add code to allow VIRTCHNL_OP_GET_SUPPORTED_RXDIDS message to be processed. Add necessary macros for registers. 
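For context, the expected VF-side counterpart (which lands in the iavf driver separately and is not part of this patch) would negotiate the capability, query the bitmap, and then select a supported RXDID per Rx queue. A rough, hypothetical sketch follows, where vf_send_recv() and struct vf_adapter are invented names standing in for the VF's virtchnl transport; only the virtchnl definitions themselves are added by this patch:

  /* Hypothetical VF-side flow; transport helpers are made up. */
  static int vf_pick_rxdid(struct vf_adapter *ad,
                           struct virtchnl_rxq_info *rxq, u8 want_rxdid)
  {
          struct virtchnl_supported_rxdids rxdids = {};
          int err;

          if (!(ad->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)) {
                  rxq->rxdid = 0; /* PF will use the legacy 32-byte format */
                  return 0;
          }

          err = vf_send_recv(ad, VIRTCHNL_OP_GET_SUPPORTED_RXDIDS,
                             &rxdids, sizeof(rxdids));
          if (err)
                  return err;

          /* Use the flexible descriptor only if the PF reports support. */
          if (rxdids.supported_rxdids & BIT_ULL(want_rxdid))
                  rxq->rxdid = want_rxdid;
          else
                  rxq->rxdid = 0;
          return 0;
  }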
Signed-off-by: Leyi Rong Signed-off-by: Xu Ting Signed-off-by: Michal Jaron Signed-off-by: Mateusz Palczewski Tested-by: Maxime Coquelin Tested-by: Konrad Jankowski Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20221025161252.1952939-1-jacob.e.keller@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice.h | 2 + drivers/net/ethernet/intel/ice/ice_hw_autogen.h | 3 + drivers/net/ethernet/intel/ice/ice_virtchnl.c | 86 ++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_virtchnl.h | 2 + .../ethernet/intel/ice/ice_virtchnl_allowlist.c | 6 ++ include/linux/avf/virtchnl.h | 14 +++- 6 files changed, 111 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index d75c8d11f867..f88ee051e71c 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -609,6 +609,8 @@ struct ice_pf { u16 num_dmac_chnl_fltrs; struct hlist_head tc_flower_fltr_list; + u64 supported_rxdids; + __le64 nvm_phy_type_lo; /* NVM PHY type low */ __le64 nvm_phy_type_hi; /* NVM PHY type high */ struct ice_link_default_override_tlv link_dflt_override; diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index d16738a3d3a7..a92dc9a16035 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -110,6 +110,9 @@ #define PRTDCB_TUP2TC 0x001D26C0 #define GL_PREEXT_L2_PMASK0(_i) (0x0020F0FC + ((_i) * 4)) #define GL_PREEXT_L2_PMASK1(_i) (0x0020F108 + ((_i) * 4)) +#define GLFLXP_RXDID_FLAGS(_i, _j) (0x0045D000 + ((_i) * 4 + (_j) * 256)) +#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S 0 +#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M ICE_M(0x3F, 0) #define GLFLXP_RXDID_FLX_WRD_0(_i) (0x0045c800 + ((_i) * 4)) #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_S 0 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_M ICE_M(0xFF, 0) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 2b4c791b6cba..c1fa94381f4e 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -462,6 +462,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_FDIR_PF; @@ -1618,6 +1621,9 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) } for (i = 0; i < qci->num_queue_pairs; i++) { + struct ice_hw *hw; + u32 rxdid; + u16 pf_q; qpi = &qci->qpair[i]; if (qpi->txq.vsi_id != qci->vsi_id || qpi->rxq.vsi_id != qci->vsi_id || @@ -1686,6 +1692,25 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) goto error_param; } } + + /* VF Rx queue RXDID configuration */ + pf_q = vsi->rxq_map[qpi->rxq.queue_id]; + rxdid = qpi->rxq.rxdid; + hw = &vsi->back->hw; + + /* If Rx flex desc is supported, select RXDID for Rx queues. + * Otherwise, use legacy 32byte descriptor format. + * Legacy 16byte descriptor is not supported. If this RXDID + * is selected, return error. 
+ */ + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) { + if (!(BIT(rxdid) & pf->supported_rxdids)) + goto error_param; + } else { + rxdid = ICE_RXDID_LEGACY_1; + } + + ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x03, false); } /* send the response to the VF */ @@ -2456,6 +2481,62 @@ error_param: v_ret, NULL, 0); } +/** + * ice_vc_query_rxdid - query RXDID supported by DDP package + * @vf: pointer to VF info + * + * Called from VF to query a bitmap of supported flexible + * descriptor RXDIDs of a DDP package. + */ +static int ice_vc_query_rxdid(struct ice_vf *vf) +{ + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct virtchnl_supported_rxdids *rxdid = NULL; + struct ice_hw *hw = &vf->pf->hw; + struct ice_pf *pf = vf->pf; + int len = 0; + int ret, i; + u32 regval; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + + len = sizeof(struct virtchnl_supported_rxdids); + rxdid = kzalloc(len, GFP_KERNEL); + if (!rxdid) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + len = 0; + goto err; + } + + /* Read flexiflag registers to determine whether the + * corresponding RXDID is configured and supported or not. + * Since Legacy 16byte descriptor format is not supported, + * start from Legacy 32byte descriptor. + */ + for (i = ICE_RXDID_LEGACY_1; i < ICE_FLEX_DESC_RXDID_MAX_NUM; i++) { + regval = rd32(hw, GLFLXP_RXDID_FLAGS(i, 0)); + if ((regval >> GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) + & GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) + rxdid->supported_rxdids |= BIT(i); + } + + pf->supported_rxdids = rxdid->supported_rxdids; + +err: + ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, + v_ret, (u8 *)rxdid, len); + kfree(rxdid); + return ret; +} + /** * ice_vf_init_vlan_stripping - enable/disable VLAN stripping on initialization * @vf: VF to enable/disable VLAN stripping for on initialization @@ -3490,6 +3571,7 @@ static const struct ice_virtchnl_ops ice_virtchnl_dflt_ops = { .cfg_promiscuous_mode_msg = ice_vc_cfg_promiscuous_mode_msg, .add_vlan_msg = ice_vc_add_vlan_msg, .remove_vlan_msg = ice_vc_remove_vlan_msg, + .query_rxdid = ice_vc_query_rxdid, .ena_vlan_stripping = ice_vc_ena_vlan_stripping, .dis_vlan_stripping = ice_vc_dis_vlan_stripping, .handle_rss_cfg_msg = ice_vc_handle_rss_cfg, @@ -3624,6 +3706,7 @@ static const struct ice_virtchnl_ops ice_virtchnl_repr_ops = { .cfg_promiscuous_mode_msg = ice_vc_repr_cfg_promiscuous_mode, .add_vlan_msg = ice_vc_add_vlan_msg, .remove_vlan_msg = ice_vc_remove_vlan_msg, + .query_rxdid = ice_vc_query_rxdid, .ena_vlan_stripping = ice_vc_ena_vlan_stripping, .dis_vlan_stripping = ice_vc_dis_vlan_stripping, .handle_rss_cfg_msg = ice_vc_handle_rss_cfg, @@ -3764,6 +3847,9 @@ error_handler: case VIRTCHNL_OP_DEL_VLAN: err = ops->remove_vlan_msg(vf, msg); break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + err = ops->query_rxdid(vf); + break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: err = ops->ena_vlan_stripping(vf); break; diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.h b/drivers/net/ethernet/intel/ice/ice_virtchnl.h index b5a3fd8adbb4..4867a92ebefb 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.h @@ -17,6 +17,7 @@ * broadcast, and 16 for additional unicast/multicast filters */ #define ICE_MAX_MACADDR_PER_VF 18 +#define ICE_FLEX_DESC_RXDID_MAX_NUM 64 struct ice_virtchnl_ops { int (*get_ver_msg)(struct 
ice_vf *vf, u8 *msg); @@ -35,6 +36,7 @@ struct ice_virtchnl_ops { int (*cfg_promiscuous_mode_msg)(struct ice_vf *vf, u8 *msg); int (*add_vlan_msg)(struct ice_vf *vf, u8 *msg); int (*remove_vlan_msg)(struct ice_vf *vf, u8 *msg); + int (*query_rxdid)(struct ice_vf *vf); int (*ena_vlan_stripping)(struct ice_vf *vf); int (*dis_vlan_stripping)(struct ice_vf *vf); int (*handle_rss_cfg_msg)(struct ice_vf *vf, u8 *msg, bool add); diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c index 5a82216e7d03..7d547fa616fa 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c @@ -70,6 +70,11 @@ static const u32 rss_pf_allowlist_opcodes[] = { VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA, }; +/* VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC */ +static const u32 rx_flex_desc_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, +}; + /* VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF */ static const u32 adv_rss_pf_allowlist_opcodes[] = { VIRTCHNL_OP_ADD_RSS_CFG, VIRTCHNL_OP_DEL_RSS_CFG, @@ -96,6 +101,7 @@ static const struct allowlist_opcode_info allowlist_opcodes[] = { ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC, rx_flex_desc_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF, adv_rss_pf_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN_V2, vlan_v2_allowlist_opcodes), diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2ce27e8e4f19..d91af50ac58d 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,7 +136,8 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, - /* opcode 34 - 44 are reserved */ + /* opcode 34 - 43 are reserved */ + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS = 44, VIRTCHNL_OP_ADD_RSS_CFG = 45, VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, @@ -263,6 +264,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) #define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) #define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) @@ -318,7 +320,9 @@ struct virtchnl_rxq_info { u16 splithdr_enabled; /* deprecated with AVF 1.0 */ u32 databuffer_size; u32 max_pkt_size; - u32 pad1; + u8 pad0; + u8 rxdid; + u8 pad1[2]; u64 dma_ring_addr; enum virtchnl_rx_hsplit rx_split_pos; /* deprecated with AVF 1.0 */ u32 pad2; @@ -970,6 +974,10 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); +struct virtchnl_supported_rxdids { + u64 supported_rxdids; +}; + /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. 
* No direct response is expected from the VF, though it may generate other @@ -1499,6 +1507,8 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_CLOUD_FILTER: valid_len = sizeof(struct virtchnl_filter); break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + break; case VIRTCHNL_OP_ADD_RSS_CFG: case VIRTCHNL_OP_DEL_RSS_CFG: valid_len = sizeof(struct virtchnl_rss_cfg); -- cgit v1.2.3 From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/uapi/linux/snmp.h | 1 + include/uapi/linux/tcp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_plb.c | 2 ++ 6 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..ca7f05a130d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -423,6 +423,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. 
*/ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4d7470036a8b..6600cb0164c2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -292,6 +292,7 @@ enum LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ + LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8fc09e8638b3..c9abe86eda5f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -315,6 +315,7 @@ enum { TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ TCP_NLA_TTL, /* TTL or hop limit of a packet received */ + TCP_NLA_REHASH, /* PLB and timeout triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 5386f460bd20..f88daace9de3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..1da7c53b6cb5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3176,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rack.mstamp = 0; @@ -3973,6 +3974,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -4049,6 +4051,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index f4ced370acad..bb1a08fda113 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -79,6 +79,8 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; + tcp_sk(sk)->plb_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); -- cgit v1.2.3 From 9c5a170677c3c8facc83e931a57f4c99c0511ae0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:10:37 +0100 Subject: net: phylink: add phylink_get_link_timer_ns() helper Add a helper to convert the PHY interface mode to the required link timer setting as stated by the appropriate standard. Inappropriate interface modes return an error. 
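As a usage illustration, a PCS driver could translate the interface mode into its hardware's link timer register along the lines of the sketch below; foo_pcs_write(), FOO_LINK_TIMER and the 8 ns tick granularity are invented for the example, and only phylink_get_link_timer_ns() itself is added by this patch:

  /* Hypothetical PCS driver sketch. */
  static int foo_pcs_set_link_timer(struct foo_pcs *pcs,
                                    phy_interface_t interface)
  {
          int link_timer_ns = phylink_get_link_timer_ns(interface);

          if (link_timer_ns < 0)
                  return link_timer_ns; /* no link timer for this mode */

          /* Program the timer, assuming the hardware counts 8 ns ticks. */
          return foo_pcs_write(pcs, FOO_LINK_TIMER, link_timer_ns / 8);
  }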
Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 63800bf4a7ac..1df3e5e316e8 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -616,6 +616,30 @@ int phylink_speed_up(struct phylink *pl); void phylink_set_port_modes(unsigned long *bits); +/** + * phylink_get_link_timer_ns - return the PCS link timer value + * @interface: link &typedef phy_interface_t mode + * + * Return the PCS link timer setting in nanoseconds for the PHY @interface + * mode, or -EINVAL if not appropriate. + */ +static inline int phylink_get_link_timer_ns(phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_USXGMII: + return 1600000; + + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + return 10000000; + + default: + return -EINVAL; + } +} + void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, -- cgit v1.2.3 From 17dd361119e5bc4cfb85aed660674a846b613817 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:16 +0100 Subject: net: sfp: convert register indexes from hex to decimal The register indexes in the standards are in decimal rather than hex, so lets specify them in decimal in the header file so we can easily cross-reference without converting between hex and decimal. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 180 ++++++++++++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 01ae9f1dd2ad..4e2db155664d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -332,37 +332,37 @@ enum { /* SFP EEPROM registers */ enum { - SFP_PHYS_ID = 0x00, - SFP_PHYS_EXT_ID = 0x01, - SFP_CONNECTOR = 0x02, - SFP_COMPLIANCE = 0x03, - SFP_ENCODING = 0x0b, - SFP_BR_NOMINAL = 0x0c, - SFP_RATE_ID = 0x0d, - SFP_LINK_LEN_SM_KM = 0x0e, - SFP_LINK_LEN_SM_100M = 0x0f, - SFP_LINK_LEN_50UM_OM2_10M = 0x10, - SFP_LINK_LEN_62_5UM_OM1_10M = 0x11, - SFP_LINK_LEN_COPPER_1M = 0x12, - SFP_LINK_LEN_50UM_OM4_10M = 0x12, - SFP_LINK_LEN_50UM_OM3_10M = 0x13, - SFP_VENDOR_NAME = 0x14, - SFP_VENDOR_OUI = 0x25, - SFP_VENDOR_PN = 0x28, - SFP_VENDOR_REV = 0x38, - SFP_OPTICAL_WAVELENGTH_MSB = 0x3c, - SFP_OPTICAL_WAVELENGTH_LSB = 0x3d, - SFP_CABLE_SPEC = 0x3c, - SFP_CC_BASE = 0x3f, - SFP_OPTIONS = 0x40, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 0x42, - SFP_BR_MIN = 0x43, - SFP_VENDOR_SN = 0x44, - SFP_DATECODE = 0x54, - SFP_DIAGMON = 0x5c, - SFP_ENHOPTS = 0x5d, - SFP_SFF8472_COMPLIANCE = 0x5e, - SFP_CC_EXT = 0x5f, + SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_CONNECTOR = 2, + SFP_COMPLIANCE = 3, + SFP_ENCODING = 11, + SFP_BR_NOMINAL = 12, + SFP_RATE_ID = 13, + SFP_LINK_LEN_SM_KM = 14, + SFP_LINK_LEN_SM_100M = 15, + SFP_LINK_LEN_50UM_OM2_10M = 16, + SFP_LINK_LEN_62_5UM_OM1_10M = 17, + SFP_LINK_LEN_COPPER_1M = 18, + SFP_LINK_LEN_50UM_OM4_10M = 18, + SFP_LINK_LEN_50UM_OM3_10M = 19, + SFP_VENDOR_NAME = 20, + SFP_VENDOR_OUI = 37, + SFP_VENDOR_PN = 40, + SFP_VENDOR_REV = 56, + SFP_OPTICAL_WAVELENGTH_MSB = 60, + SFP_OPTICAL_WAVELENGTH_LSB = 61, + SFP_CABLE_SPEC = 60, + SFP_CC_BASE = 63, + SFP_OPTIONS = 64, 
/* 2 bytes, MSB, LSB */ + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + SFP_DIAGMON = 92, + SFP_ENHOPTS = 93, + SFP_SFF8472_COMPLIANCE = 94, + SFP_CC_EXT = 95, SFP_PHYS_EXT_ID_SFP = 0x04, SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), @@ -404,63 +404,63 @@ enum { /* SFP Diagnostics */ enum { /* Alarm and warnings stored MSB at lower address then LSB */ - SFP_TEMP_HIGH_ALARM = 0x00, - SFP_TEMP_LOW_ALARM = 0x02, - SFP_TEMP_HIGH_WARN = 0x04, - SFP_TEMP_LOW_WARN = 0x06, - SFP_VOLT_HIGH_ALARM = 0x08, - SFP_VOLT_LOW_ALARM = 0x0a, - SFP_VOLT_HIGH_WARN = 0x0c, - SFP_VOLT_LOW_WARN = 0x0e, - SFP_BIAS_HIGH_ALARM = 0x10, - SFP_BIAS_LOW_ALARM = 0x12, - SFP_BIAS_HIGH_WARN = 0x14, - SFP_BIAS_LOW_WARN = 0x16, - SFP_TXPWR_HIGH_ALARM = 0x18, - SFP_TXPWR_LOW_ALARM = 0x1a, - SFP_TXPWR_HIGH_WARN = 0x1c, - SFP_TXPWR_LOW_WARN = 0x1e, - SFP_RXPWR_HIGH_ALARM = 0x20, - SFP_RXPWR_LOW_ALARM = 0x22, - SFP_RXPWR_HIGH_WARN = 0x24, - SFP_RXPWR_LOW_WARN = 0x26, - SFP_LASER_TEMP_HIGH_ALARM = 0x28, - SFP_LASER_TEMP_LOW_ALARM = 0x2a, - SFP_LASER_TEMP_HIGH_WARN = 0x2c, - SFP_LASER_TEMP_LOW_WARN = 0x2e, - SFP_TEC_CUR_HIGH_ALARM = 0x30, - SFP_TEC_CUR_LOW_ALARM = 0x32, - SFP_TEC_CUR_HIGH_WARN = 0x34, - SFP_TEC_CUR_LOW_WARN = 0x36, - SFP_CAL_RXPWR4 = 0x38, - SFP_CAL_RXPWR3 = 0x3c, - SFP_CAL_RXPWR2 = 0x40, - SFP_CAL_RXPWR1 = 0x44, - SFP_CAL_RXPWR0 = 0x48, - SFP_CAL_TXI_SLOPE = 0x4c, - SFP_CAL_TXI_OFFSET = 0x4e, - SFP_CAL_TXPWR_SLOPE = 0x50, - SFP_CAL_TXPWR_OFFSET = 0x52, - SFP_CAL_T_SLOPE = 0x54, - SFP_CAL_T_OFFSET = 0x56, - SFP_CAL_V_SLOPE = 0x58, - SFP_CAL_V_OFFSET = 0x5a, - SFP_CHKSUM = 0x5f, - - SFP_TEMP = 0x60, - SFP_VCC = 0x62, - SFP_TX_BIAS = 0x64, - SFP_TX_POWER = 0x66, - SFP_RX_POWER = 0x68, - SFP_LASER_TEMP = 0x6a, - SFP_TEC_CUR = 0x6c, - - SFP_STATUS = 0x6e, + SFP_TEMP_HIGH_ALARM = 0, + SFP_TEMP_LOW_ALARM = 2, + SFP_TEMP_HIGH_WARN = 4, + SFP_TEMP_LOW_WARN = 6, + SFP_VOLT_HIGH_ALARM = 8, + SFP_VOLT_LOW_ALARM = 10, + SFP_VOLT_HIGH_WARN = 12, + SFP_VOLT_LOW_WARN = 14, + SFP_BIAS_HIGH_ALARM = 16, + SFP_BIAS_LOW_ALARM = 18, + SFP_BIAS_HIGH_WARN = 20, + SFP_BIAS_LOW_WARN = 22, + SFP_TXPWR_HIGH_ALARM = 24, + SFP_TXPWR_LOW_ALARM = 26, + SFP_TXPWR_HIGH_WARN = 28, + SFP_TXPWR_LOW_WARN = 30, + SFP_RXPWR_HIGH_ALARM = 32, + SFP_RXPWR_LOW_ALARM = 34, + SFP_RXPWR_HIGH_WARN = 36, + SFP_RXPWR_LOW_WARN = 38, + SFP_LASER_TEMP_HIGH_ALARM = 40, + SFP_LASER_TEMP_LOW_ALARM = 42, + SFP_LASER_TEMP_HIGH_WARN = 44, + SFP_LASER_TEMP_LOW_WARN = 46, + SFP_TEC_CUR_HIGH_ALARM = 48, + SFP_TEC_CUR_LOW_ALARM = 50, + SFP_TEC_CUR_HIGH_WARN = 52, + SFP_TEC_CUR_LOW_WARN = 54, + SFP_CAL_RXPWR4 = 56, + SFP_CAL_RXPWR3 = 60, + SFP_CAL_RXPWR2 = 64, + SFP_CAL_RXPWR1 = 68, + SFP_CAL_RXPWR0 = 72, + SFP_CAL_TXI_SLOPE = 76, + SFP_CAL_TXI_OFFSET = 78, + SFP_CAL_TXPWR_SLOPE = 80, + SFP_CAL_TXPWR_OFFSET = 82, + SFP_CAL_T_SLOPE = 84, + SFP_CAL_T_OFFSET = 86, + SFP_CAL_V_SLOPE = 88, + SFP_CAL_V_OFFSET = 90, + SFP_CHKSUM = 95, + + SFP_TEMP = 96, + SFP_VCC = 98, + SFP_TX_BIAS = 100, + SFP_TX_POWER = 102, + SFP_RX_POWER = 104, + SFP_LASER_TEMP = 106, + SFP_TEC_CUR = 108, + + SFP_STATUS = 110, SFP_STATUS_TX_DISABLE = BIT(7), SFP_STATUS_TX_DISABLE_FORCE = BIT(6), SFP_STATUS_TX_FAULT = BIT(2), SFP_STATUS_RX_LOS = BIT(1), - SFP_ALARM0 = 0x70, + SFP_ALARM0 = 112, SFP_ALARM0_TEMP_HIGH = BIT(7), SFP_ALARM0_TEMP_LOW = BIT(6), SFP_ALARM0_VCC_HIGH = BIT(5), @@ -470,11 +470,11 @@ enum { SFP_ALARM0_TXPWR_HIGH = BIT(1), SFP_ALARM0_TXPWR_LOW = BIT(0), - SFP_ALARM1 = 0x71, + SFP_ALARM1 = 113, SFP_ALARM1_RXPWR_HIGH = BIT(7), SFP_ALARM1_RXPWR_LOW = BIT(6), - 
SFP_WARN0 = 0x74, + SFP_WARN0 = 116, SFP_WARN0_TEMP_HIGH = BIT(7), SFP_WARN0_TEMP_LOW = BIT(6), SFP_WARN0_VCC_HIGH = BIT(5), @@ -484,15 +484,15 @@ enum { SFP_WARN0_TXPWR_HIGH = BIT(1), SFP_WARN0_TXPWR_LOW = BIT(0), - SFP_WARN1 = 0x75, + SFP_WARN1 = 117, SFP_WARN1_RXPWR_HIGH = BIT(7), SFP_WARN1_RXPWR_LOW = BIT(6), - SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS = 118, SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), - SFP_VSL = 0x78, - SFP_PAGE = 0x7f, + SFP_VSL = 120, + SFP_PAGE = 127, }; struct fwnode_handle; -- cgit v1.2.3 From d83845d224a059165902ecd0e1203015c5713a78 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:21 +0100 Subject: net: sfp: move field definitions along side register index Just as we do for the A2h enum, arrange the A0h enum to have the field definitions next to their corresponding register index. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 4e2db155664d..52b98f9666a2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -333,7 +333,10 @@ enum { /* SFP EEPROM registers */ enum { SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_CONNECTOR = 2, SFP_COMPLIANCE = 3, SFP_ENCODING = 11, @@ -354,17 +357,8 @@ enum { SFP_OPTICAL_WAVELENGTH_LSB = 61, SFP_CABLE_SPEC = 60, SFP_CC_BASE = 63, - SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 66, - SFP_BR_MIN = 67, - SFP_VENDOR_SN = 68, - SFP_DATECODE = 84, - SFP_DIAGMON = 92, - SFP_ENHOPTS = 93, - SFP_SFF8472_COMPLIANCE = 94, - SFP_CC_EXT = 95, - SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), SFP_OPTIONS_PAGING_A2 = BIT(12), SFP_OPTIONS_RETIMER = BIT(11), @@ -378,11 +372,20 @@ enum { SFP_OPTIONS_TX_FAULT = BIT(3), SFP_OPTIONS_LOS_INVERTED = BIT(2), SFP_OPTIONS_LOS_NORMAL = BIT(1), + + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + + SFP_DIAGMON = 92, SFP_DIAGMON_DDM = BIT(6), SFP_DIAGMON_INT_CAL = BIT(5), SFP_DIAGMON_EXT_CAL = BIT(4), SFP_DIAGMON_RXPWR_AVG = BIT(3), SFP_DIAGMON_ADDRMODE = BIT(2), + + SFP_ENHOPTS = 93, SFP_ENHOPTS_ALARMWARN = BIT(7), SFP_ENHOPTS_SOFT_TX_DISABLE = BIT(6), SFP_ENHOPTS_SOFT_TX_FAULT = BIT(5), @@ -390,6 +393,8 @@ enum { SFP_ENHOPTS_SOFT_RATE_SELECT = BIT(3), SFP_ENHOPTS_APP_SELECT_SFF8079 = BIT(2), SFP_ENHOPTS_SOFT_RATE_SFF8431 = BIT(1), + + SFP_SFF8472_COMPLIANCE = 94, SFP_SFF8472_COMPLIANCE_NONE = 0x00, SFP_SFF8472_COMPLIANCE_REV9_3 = 0x01, SFP_SFF8472_COMPLIANCE_REV9_5 = 0x02, @@ -399,6 +404,8 @@ enum { SFP_SFF8472_COMPLIANCE_REV11_3 = 0x06, SFP_SFF8472_COMPLIANCE_REV11_4 = 0x07, SFP_SFF8472_COMPLIANCE_REV12_0 = 0x08, + + SFP_CC_EXT = 95, }; /* SFP Diagnostics */ -- cgit v1.2.3 From f3fb589aeb88c5bf7a7a804d36a8fa219b72e290 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Fri, 28 Oct 2022 01:04:24 +0900 Subject: net: remove unused netdev_unregistering() Currently, the NETREG_UNREGISTERING state is checked by comparing dev->reg_state == NETREG_UNREGISTERING directly rather than by calling netdev_unregistering(). As the netdev_unregistering() helper in netdevice.h therefore no longer has any users, remove it from netdevice.h. Signed-off-by: Juhee Kang Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eddf8ee270e7..99e58b773266 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5101,11 +5101,6 @@ static inline const char *netdev_name(const struct net_device *dev) return dev->name; } -static inline bool netdev_unregistering(const struct net_device *dev) -{ - return dev->reg_state == NETREG_UNREGISTERING; -} - static inline const char *netdev_reg_state(const struct net_device *dev) { switch (dev->reg_state) { -- cgit v1.2.3 From b9a61b97798ca6d0c856a217e61e2177bd8f6eae Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:12 -0700 Subject: ptp: add missing documentation for parameters The ptp_find_pin_unlocked function and the ptp_system_timestamp structure didn't document their parameters and fields. Fix this. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 92b44161408e..ad4aaadc2f7a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -45,6 +45,8 @@ struct system_device_crosststamp; /** * struct ptp_system_timestamp - system time corresponding to a PHC timestamp + * @pre_ts: system timestamp before capturing PHC + * @post_ts: system timestamp after capturing PHC */ struct ptp_system_timestamp { struct timespec64 pre_ts; @@ -316,6 +318,11 @@ int ptp_find_pin(struct ptp_clock *ptp, * should most likely call ptp_find_pin() directly from their * ptp_clock_info::enable() method. * +* @ptp: The clock obtained from ptp_clock_register(). +* @func: One of the ptp_pin_function enumerated values. +* @chan: The particular functional channel to find. +* Return: Pin index in the range of zero to ptp_clock_caps.n_pins - 1, +* or -1 if the auxiliary function cannot be found. */ int ptp_find_pin_unlocked(struct ptp_clock *ptp, -- cgit v1.2.3 From 1060707e380994e7c9b754b6eb74f25459b4a5b3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:13 -0700 Subject: ptp: introduce helpers to adjust by scaled parts per million Many drivers implement the .adjfreq or .adjfine PTP op function with the same basic logic: 1. Determine a base frequency value 2. Multiply this by the abs() of the requested adjustment, then divide by the appropriate divisor (1 billion, or 65,536 billion). 3. Add or subtract this difference from the base frequency to calculate a new adjustment. A few drivers need the difference and direction rather than the combined new increment value. I recently converted the Intel drivers to .adjfine and the scaled parts per million (65.536 parts per billion) logic. To avoid overflow with minimal loss of precision, mul_u64_u64_div_u64 was used. The basic logic used by all of these drivers is very similar, and leads to a lot of duplicate code to perform the same task. Rather than keep this duplicate code, introduce diff_by_scaled_ppm and adjust_by_scaled_ppm. These helper functions calculate the difference or adjustment necessary based on the scaled parts per million input. The diff_by_scaled_ppm function returns true if the difference should be subtracted, and false otherwise. Update the Intel drivers to use the new helper functions. 
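For reference, the adjustment helper encapsulates essentially the following arithmetic (a sketch of the semantics only; the real helpers live in include/linux/ptp_clock_kernel.h and may differ in detail):

  /* Sketch: scaled_ppm is in units of 2^-16 ppm, i.e. one unit is
   * 65.536 parts per billion, hence the 10^6 * 2^16 divisor.
   */
  static u64 sketch_adjust_by_scaled_ppm(u64 base, long scaled_ppm)
  {
          bool negative = scaled_ppm < 0;
          u64 diff;

          if (negative)
                  scaled_ppm = -scaled_ppm;

          diff = mul_u64_u64_div_u64(base, scaled_ppm, 1000000ULL << 16);

          return negative ? base - diff : base + diff;
  }

diff_by_scaled_ppm performs the same computation but returns the direction as a boolean (true meaning the difference should be subtracted) and, in the real implementation, reports the magnitude through an output parameter.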
Other vendor drivers will be converted to .adjfine and this helper function in the following changes. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e1000e/ptp.c | 16 ++-------- drivers/net/ethernet/intel/i40e/i40e_ptp.c | 17 ++-------- drivers/net/ethernet/intel/ice/ice_ptp.c | 18 ++--------- drivers/net/ethernet/intel/igb/igb_ptp.c | 18 ++--------- drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 24 +++------------ include/linux/ptp_clock_kernel.h | 46 ++++++++++++++++++++++++++++ 6 files changed, 60 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 0e488e4fa5c1..6e5a1720e6cd 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -29,17 +29,11 @@ static int e1000e_phc_adjfine(struct ptp_clock_info *ptp, long delta) struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter, ptp_clock_info); struct e1000_hw *hw = &adapter->hw; - bool neg_adj = false; unsigned long flags; - u64 adjustment; - u32 timinca, incvalue; + u64 incvalue; + u32 timinca; s32 ret_val; - if (delta < 0) { - neg_adj = true; - delta = -delta; - } - /* Get the System Time Register SYSTIM base frequency */ ret_val = e1000e_get_base_timinca(adapter, &timinca); if (ret_val) @@ -48,11 +42,7 @@ static int e1000e_phc_adjfine(struct ptp_clock_info *ptp, long delta) spin_lock_irqsave(&adapter->systim_lock, flags); incvalue = timinca & E1000_TIMINCA_INCVALUE_MASK; - - adjustment = mul_u64_u64_div_u64(incvalue, (u64)delta, - 1000000ULL << 16); - - incvalue = neg_adj ? (incvalue - adjustment) : (incvalue + adjustment); + incvalue = adjust_by_scaled_ppm(incvalue, delta); timinca &= ~E1000_TIMINCA_INCVALUE_MASK; timinca |= incvalue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index ffea0c9c82f1..c37abbb3cd06 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -347,23 +347,12 @@ static int i40e_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps); struct i40e_hw *hw = &pf->hw; - u64 adj, freq, diff; - int neg_adj = 0; - - if (scaled_ppm < 0) { - neg_adj = 1; - scaled_ppm = -scaled_ppm; - } + u64 adj, base_adj; smp_mb(); /* Force any pending update before accessing. 
*/ - freq = I40E_PTP_40GB_INCVAL * READ_ONCE(pf->ptp_adj_mult); - diff = mul_u64_u64_div_u64(freq, (u64)scaled_ppm, - 1000000ULL << 16); + base_adj = I40E_PTP_40GB_INCVAL * READ_ONCE(pf->ptp_adj_mult); - if (neg_adj) - adj = I40E_PTP_40GB_INCVAL - diff; - else - adj = I40E_PTP_40GB_INCVAL + diff; + adj = adjust_by_scaled_ppm(base_adj, scaled_ppm); wr32(hw, I40E_PRTTSYN_INC_L, adj & 0xFFFFFFFF); wr32(hw, I40E_PRTTSYN_INC_H, adj >> 32); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 011b727ab190..5cf198a33e26 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1444,24 +1444,10 @@ static int ice_ptp_adjfine(struct ptp_clock_info *info, long scaled_ppm) { struct ice_pf *pf = ptp_info_to_pf(info); struct ice_hw *hw = &pf->hw; - u64 incval, diff; - int neg_adj = 0; + u64 incval; int err; - incval = ice_base_incval(pf); - - if (scaled_ppm < 0) { - neg_adj = 1; - scaled_ppm = -scaled_ppm; - } - - diff = mul_u64_u64_div_u64(incval, (u64)scaled_ppm, - 1000000ULL << 16); - if (neg_adj) - incval -= diff; - else - incval += diff; - + incval = adjust_by_scaled_ppm(ice_base_incval(pf), scaled_ppm); err = ice_ptp_write_incval_locked(hw, incval); if (err) { dev_err(ice_pf_to_dev(pf), "PTP failed to set incval, err %d\n", diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 15e57460e19e..6f471b91f562 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -195,23 +195,9 @@ static int igb_ptp_adjfine_82576(struct ptp_clock_info *ptp, long scaled_ppm) struct igb_adapter *igb = container_of(ptp, struct igb_adapter, ptp_caps); struct e1000_hw *hw = &igb->hw; - int neg_adj = 0; - u64 rate; - u32 incvalue; - - if (scaled_ppm < 0) { - neg_adj = 1; - scaled_ppm = -scaled_ppm; - } - - incvalue = INCVALUE_82576; - rate = mul_u64_u64_div_u64(incvalue, (u64)scaled_ppm, - 1000000ULL << 16); + u64 incvalue; - if (neg_adj) - incvalue -= rate; - else - incvalue += rate; + incvalue = adjust_by_scaled_ppm(INCVALUE_82576, scaled_ppm); wr32(E1000_TIMINCA, INCPERIOD_82576 | (incvalue & INCVALUE_82576_MASK)); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index f8605f57bd06..fd3f77a9e28d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -451,21 +451,11 @@ static int ixgbe_ptp_adjfine_82599(struct ptp_clock_info *ptp, long scaled_ppm) struct ixgbe_adapter *adapter = container_of(ptp, struct ixgbe_adapter, ptp_caps); struct ixgbe_hw *hw = &adapter->hw; - u64 incval, diff; - int neg_adj = 0; - - if (scaled_ppm < 0) { - neg_adj = 1; - scaled_ppm = -scaled_ppm; - } + u64 incval; smp_mb(); incval = READ_ONCE(adapter->base_incval); - - diff = mul_u64_u64_div_u64(incval, scaled_ppm, - 1000000ULL << 16); - - incval = neg_adj ? 
(incval - diff) : (incval + diff); + incval = adjust_by_scaled_ppm(incval, scaled_ppm); switch (hw->mac.type) { case ixgbe_mac_X540: @@ -502,17 +492,11 @@ static int ixgbe_ptp_adjfine_X550(struct ptp_clock_info *ptp, long scaled_ppm) struct ixgbe_adapter *adapter = container_of(ptp, struct ixgbe_adapter, ptp_caps); struct ixgbe_hw *hw = &adapter->hw; - int neg_adj = 0; + bool neg_adj; u64 rate; u32 inca; - if (scaled_ppm < 0) { - neg_adj = 1; - scaled_ppm = -scaled_ppm; - } - - rate = mul_u64_u64_div_u64(IXGBE_X550_BASE_PERIOD, scaled_ppm, - 1000000ULL << 16); + neg_adj = diff_by_scaled_ppm(IXGBE_X550_BASE_PERIOD, scaled_ppm, &rate); /* warn if rate is too large */ if (rate >= INCVALUE_MASK) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index ad4aaadc2f7a..f4781c5766d6 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -248,6 +248,52 @@ static inline long scaled_ppm_to_ppb(long ppm) return (long)ppb; } +/** + * diff_by_scaled_ppm - Calculate difference using scaled ppm + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million to adjust by + * @diff: on return, the absolute value of calculated diff + * + * Calculate the difference to adjust the base increment using scaled parts + * per million. + * + * Use mul_u64_u64_div_u64 to perform the difference calculation to avoid + * possible overflow. + * + * Returns: true if scaled_ppm is negative, false otherwise + */ +static inline bool diff_by_scaled_ppm(u64 base, long scaled_ppm, u64 *diff) +{ + bool negative = false; + + if (scaled_ppm < 0) { + negative = true; + scaled_ppm = -scaled_ppm; + } + + *diff = mul_u64_u64_div_u64(base, (u64)scaled_ppm, 1000000ULL << 16); + + return negative; +} + +/** + * adjust_by_scaled_ppm - Adjust a base increment by scaled parts per million + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million frequency adjustment + * + * Helper function which calculates a new increment value based on the + * requested scaled parts per million adjustment. + */ +static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) +{ + u64 diff; + + if (diff_by_scaled_ppm(base, scaled_ppm, &diff)) + return base - diff; + + return base + diff; +} + #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) /** -- cgit v1.2.3 From 1d997f1013079c05b642c739901e3584a3ae558d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:21 -0400 Subject: rtnetlink: pass netlink message header and portid to rtnl_configure_link() This patch passes the netlink message header and portid to rtnl_configure_link(). All the functions in this call chain gain the new parameters so we can use them in the final rtnl_notify() call and notify user space about the new link info if the NLM_F_ECHO flag is set. - rtnl_configure_link() - __dev_notify_flags() - rtmsg_ifinfo() - rtmsg_ifinfo_event() - rtmsg_ifinfo_build_skb() - rtmsg_ifinfo_send() - rtnl_notify() Also move the __dev_notify_flags() declaration to net/core/dev.h, as Jakub suggested.
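For orientation, a hypothetical caller sketch (names are illustrative, not taken from the patch) showing how a request handler can thread the requester's portid and the request header down to the new rtnl_configure_link() signature:

/* The portid identifies the requesting socket; the header is carried
 * through so rtmsg_ifinfo_build_skb() can reuse its sequence number via
 * nlmsg_seq() and rtnl_notify() can honour NLM_F_ECHO.
 */
static int example_handle_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
				  struct net_device *dev,
				  const struct ifinfomsg *ifm)
{
	return rtnl_configure_link(dev, ifm, NETLINK_CB(skb).portid, nlh);
}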
Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/net/can/vxcan.c | 2 +- drivers/net/geneve.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/vxlan/vxlan_core.c | 4 ++-- drivers/net/wwan/wwan_core.c | 2 +- include/linux/netdevice.h | 2 -- include/linux/rtnetlink.h | 9 +++++---- include/net/netlink.h | 11 +++++++++++ include/net/rtnetlink.h | 3 ++- net/core/dev.c | 25 +++++++++++++------------ net/core/dev.h | 4 ++++ net/core/rtnetlink.c | 35 +++++++++++++++++++---------------- net/ipv4/ip_gre.c | 2 +- 13 files changed, 61 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 26a472d2ea58..4068d962203d 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -236,7 +236,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto unregister_network_device; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index f393e454f45c..89ff7f8e8c7e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1907,7 +1907,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, if (err) goto err; - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto err; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 740506c44427..ac7c0653695f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1773,7 +1773,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, veth_disable_gro(peer); netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 6ab669dcd1c6..92224b36787a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3794,7 +3794,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, goto errout; } - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; @@ -4416,7 +4416,7 @@ struct net_device *vxlan_dev_create(struct net *net, const char *name, return ERR_PTR(err); } - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 62e9f7d6c9fe..d72ee18476d1 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -1058,7 +1058,7 @@ static void wwan_create_default_link(struct wwan_device *wwandev, goto unlock; } - rtnl_configure_link(dev, NULL); /* Link initialized, notify new link */ + rtnl_configure_link(dev, NULL, 0, NULL); /* Link initialized, notify new link */ unlock: rtnl_unlock(); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99e58b773266..4b5052db978f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3855,8 +3855,6 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -void __dev_notify_flags(struct net_device *, unsigned int old_flags, - unsigned int gchanges); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char 
*, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ae2c6a3cec5d..92ad75549e9c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -12,21 +12,22 @@ extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, - u32 group, struct nlmsghdr *nlh, gfp_t flags); + u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex); + int new_ifindex, u32 portid, u32 seq); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, - gfp_t flags); + gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ diff --git a/include/net/netlink.h b/include/net/netlink.h index 784b4688fc6f..464e2e026f7b 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -899,6 +899,17 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } +/** + * nlmsg_seq - return the seq number of netlink message + * @nlh: netlink message header + * + * Returns 0 if netlink message is NULL + */ +static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) +{ + return nlh ? 
nlh->nlmsg_seq : 0; +} + /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bf8bb3357825..cd94f65dc2a9 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -187,7 +187,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct nlattr *tb[], struct netlink_ext_ack *extack); int rtnl_delete_link(struct net_device *dev); -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, struct netlink_ext_ack *exterr); diff --git a/net/core/dev.c b/net/core/dev.c index cfb68db040a4..19e0db536022 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1333,7 +1333,7 @@ void netdev_state_change(struct net_device *dev) call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1469,7 +1469,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1541,7 +1541,7 @@ void dev_close_many(struct list_head *head, bool unlink) __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); @@ -8351,7 +8351,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) - __dev_notify_flags(dev, old_flags, IFF_PROMISC); + __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } @@ -8406,7 +8406,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, - dev->gflags ^ old_gflags); + dev->gflags ^ old_gflags, 0, NULL); } return 0; } @@ -8569,12 +8569,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, - unsigned int gchanges) + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) - rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) @@ -8616,7 +8617,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); - __dev_notify_flags(dev, old_flags, changes); + __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); @@ -10101,7 +10102,7 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; @@ -10849,7 +10850,7 @@ 
void unregister_netdevice_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0); + GFP_KERNEL, NULL, 0, 0, 0); /* * Flush the unicast and multicast chains @@ -10864,7 +10865,7 @@ void unregister_netdevice_many(struct list_head *head) dev->netdev_ops->ndo_uninit(dev); if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, 0, NULL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -11042,7 +11043,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; diff --git a/net/core/dev.h b/net/core/dev.h index cbb8a925175a..6b3c7302f570 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -88,6 +88,10 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh); + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 74864dc46a7e..c9dd9730f3c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -760,7 +760,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) + const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; @@ -3180,7 +3180,8 @@ out: return err; } -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags; int err; @@ -3194,10 +3195,10 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U); + __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); } return 0; } @@ -3369,7 +3370,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out; } - err = rtnl_configure_link(dev, ifm); + err = rtnl_configure_link(dev, ifm, 0, NULL); if (err < 0) goto out_unregister; if (link_net) { @@ -3896,7 +3897,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex) + int new_ifindex, u32 portid, u32 seq) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3907,7 +3908,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, goto errout; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), - type, 0, 0, change, 0, 0, event, + type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ @@ -3922,16 
+3923,18 @@ errout: return NULL; } -void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); + rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid, int new_ifindex) + gfp_t flags, int *new_nsid, int new_ifindex, + u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; @@ -3939,23 +3942,23 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, - new_ifindex); + new_ifindex, portid, nlmsg_seq(nlh)); if (skb) - rtmsg_ifinfo_send(skb, dev, flags); + rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) + gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - NULL, 0); + NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid, new_ifindex); + new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -6140,7 +6143,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL, 0); + GFP_KERNEL, NULL, 0, 0, NULL); break; default: break; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f866d6282b2b..d8ee5238c395 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1665,7 +1665,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, if (err) goto out; - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto out; -- cgit v1.2.3 From d08b0f8f46e45a274fc8c9a5bc92cb9da70d9887 Mon Sep 17 00:00:00 2001 From: Shane Parslow Date: Sat, 29 Oct 2022 02:03:56 -0700 Subject: net: wwan: iosm: add rpc interface for xmm modems Add a new iosm wwan port that connects to the modem rpc interface. This interface provides a configuration channel, and in the case of the 7360, is the only way to configure the modem (as it does not support mbim). The new interface is compatible with existing software, such as open_xdatachannel.py from the xmm7360-pci project [1]. [1] https://github.com/xmm7360/xmm7360-pci Signed-off-by: Shane Parslow Reviewed-by: Loic Poulain Signed-off-by: David S. 
Miller --- drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c | 2 +- drivers/net/wwan/wwan_core.c | 4 ++++ include/linux/wwan.h | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c b/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c index 128c999e08bb..bcfbc6b3d617 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c +++ b/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c @@ -39,7 +39,7 @@ static struct ipc_chnl_cfg modem_cfg[] = { /* RPC - 0 */ { IPC_MEM_CTRL_CHL_ID_1, IPC_MEM_PIPE_2, IPC_MEM_PIPE_3, IPC_MEM_MAX_TDS_RPC, IPC_MEM_MAX_TDS_RPC, - IPC_MEM_MAX_DL_RPC_BUF_SIZE, WWAN_PORT_UNKNOWN }, + IPC_MEM_MAX_DL_RPC_BUF_SIZE, WWAN_PORT_XMMRPC }, /* IAT0 */ { IPC_MEM_CTRL_CHL_ID_2, IPC_MEM_PIPE_4, IPC_MEM_PIPE_5, IPC_MEM_MAX_TDS_AT, IPC_MEM_MAX_TDS_AT, IPC_MEM_MAX_DL_AT_BUF_SIZE, diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index d72ee18476d1..966d0ccd2276 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -319,6 +319,10 @@ static const struct { .name = "FIREHOSE", .devsuf = "firehose", }, + [WWAN_PORT_XMMRPC] = { + .name = "XMMRPC", + .devsuf = "xmmrpc", + }, }; static ssize_t type_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 5ce2acf444fb..24d76500b1cc 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -15,6 +15,7 @@ * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface * @WWAN_PORT_FIREHOSE: XML based command protocol + * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems * * @WWAN_PORT_MAX: Highest supported port types * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type @@ -26,6 +27,7 @@ enum wwan_port_type { WWAN_PORT_QMI, WWAN_PORT_QCDM, WWAN_PORT_FIREHOSE, + WWAN_PORT_XMMRPC, /* Add new port types above this line */ -- cgit v1.2.3 From 23da464dd6b8935b66f4ee306ad8947fd32ccd75 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:51 +0530 Subject: bpf: Allow specifying volatile type modifier for kptrs This is useful in particular to mark the pointer as volatile, so that the compiler treats each load and store to the field as a volatile access. The alternative is having to define and use READ_ONCE and WRITE_ONCE in the BPF program.
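As a rough BPF-side illustration (not taken from this patch), a map value can now declare the kptr member itself volatile; the __kptr wrapper below is the btf_type_tag("kptr") define used by the BPF selftests, and struct task_struct is just a stand-in pointee type:

#define __kptr __attribute__((btf_type_tag("kptr")))

struct map_value {
	/* volatile qualifies the pointer member itself, so the compiler
	 * emits a fresh load or store for every access instead of the
	 * program having to wrap each one in READ_ONCE()/WRITE_ONCE().
	 */
	struct task_struct __kptr *volatile ptr;
};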
Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20221103191013.1236066-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..86aad9b2ce02 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -288,6 +288,11 @@ static inline bool btf_type_is_typedef(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; } +static inline bool btf_type_is_volatile(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VOLATILE; +} + static inline bool btf_type_is_func(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..f4d21eef6ebd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3225,6 +3225,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, enum bpf_kptr_type type; u32 res_id; + /* Permit modifiers on the pointer itself */ + if (btf_type_is_volatile(t)) + t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; -- cgit v1.2.3 From a35ec8e38cdd1766f29924ca391a01de20163931 Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 1 Nov 2022 21:39:21 +0200 Subject: bridge: Add MAC Authentication Bypass (MAB) support Hosts that support 802.1X authentication are able to authenticate themselves by exchanging EAPOL frames with an authenticator (Ethernet bridge, in this case) and an authentication server. Access to the network is only granted by the authenticator to successfully authenticated hosts. The above is implemented in the bridge using the "locked" bridge port option. When enabled, link-local frames (e.g., EAPOL) can be locally received by the bridge, but all other frames are dropped unless the host is authenticated. That is, unless the user space control plane installed an FDB entry according to which the source address of the frame is located behind the locked ingress port. The entry can be dynamic, in which case learning needs to be enabled so that the entry will be refreshed by incoming traffic. There are deployments in which not all the devices connected to the authenticator (the bridge) support 802.1X. Such devices can include printers and cameras. One option to support such deployments is to unlock the bridge ports connecting these devices, but a slightly more secure option is to use MAB. When MAB is enabled, the MAC address of the connected device is used as the user name and password for the authentication. For MAB to work, the user space control plane needs to be notified about MAC addresses that are trying to gain access so that they will be compared against an allow list. This can be implemented via the regular learning process with the sole difference that learned FDB entries are installed with a new "locked" flag indicating that the entry cannot be used to authenticate the device. The flag cannot be set by user space, but user space can clear the flag by replacing the entry, thereby authenticating the device. Locked FDB entries implement the following semantics with regards to roaming, aging and forwarding: 1. Roaming: Locked FDB entries can roam to unlocked (authorized) ports, in which case the "locked" flag is cleared. FDB entries cannot roam to locked ports regardless of MAB being enabled or not. 
Therefore, locked FDB entries are only created if an FDB entry with the given {MAC, VID} does not already exist. This behavior prevents unauthenticated devices from disrupting traffic destined to already authenticated devices. 2. Aging: Locked FDB entries age and refresh by incoming traffic like regular entries. 3. Forwarding: Locked FDB entries forward traffic like regular entries. If user space detects an unauthorized MAC behind a locked port and wishes to prevent traffic with this MAC DA from reaching the host, it can do so using tc or a different mechanism. Enable the above behavior using a new bridge port option called "mab". It can only be enabled on a bridge port that is both locked and has learning enabled. Locked FDB entries are flushed from the port once MAB is disabled. A new option is added because there are pure 802.1X deployments that are not interested in notifications about locked FDB entries. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/neighbour.h | 8 +++++++- net/bridge/br_fdb.c | 24 ++++++++++++++++++++++++ net/bridge/br_input.c | 21 +++++++++++++++++++-- net/bridge/br_netlink.c | 21 ++++++++++++++++++++- net/bridge/br_private.h | 3 ++- net/core/rtnetlink.c | 5 +++++ 8 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d62ef428e3aa..1668ac4d7adc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -59,6 +59,7 @@ struct br_ip_list { #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) +#define BR_PORT_MAB BIT(22) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5e7a1041df3a..d92b3f79eba3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -561,6 +561,7 @@ enum { IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index a998bf761635..5e67a7eaf4a7 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -52,7 +52,8 @@ enum { #define NTF_STICKY (1 << 6) #define NTF_ROUTER (1 << 7) /* Extended flags under NDA_FLAGS_EXT: */ -#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_LOCKED (1 << 1) /* * Neighbor Cache Entry States. @@ -86,6 +87,11 @@ enum { * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf * of a user space control plane, and automatically refreshed so that (if * possible) they remain in NUD_REACHABLE state. + * + * NTF_EXT_LOCKED flagged bridge FDB entries are entries generated by the + * bridge in response to a host trying to communicate via a locked bridge port + * with MAB enabled. Their purpose is to notify user space that a host requires + * authentication. 
*/ struct nda_cacheinfo { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e7f4fccb6adb..3b83af4458b8 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -105,6 +105,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; + u32 ext_flags = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -125,11 +126,16 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags |= NTF_EXT_LEARNED; if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; + if (test_bit(BR_FDB_LOCKED, &fdb->flags)) + ext_flags |= NTF_EXT_LOCKED; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; + if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags)) + goto nla_put_failure; + ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); @@ -171,6 +177,7 @@ static inline size_t fdb_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ @@ -879,6 +886,11 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, &fdb->flags))) clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); + /* Clear locked flag when roaming to an + * unlocked port. + */ + if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags))) + clear_bit(BR_FDB_LOCKED, &fdb->flags); } if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) @@ -1082,6 +1094,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags)) + modified = true; + if (fdb_handle_notify(fdb, notify)) modified = true; @@ -1150,6 +1165,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br = NULL; + u32 ext_flags = 0; int err = 0; trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); @@ -1178,6 +1194,14 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FLAGS_EXT]) + ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]); + + if (ext_flags & NTF_EXT_LOCKED) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set"); + return -EINVAL; + } + if (tb[NDA_FDB_EXT_ATTRS]) { attr = tb[NDA_FDB_EXT_ATTRS]; err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 68b3e850bcb9..d04d2205ad4e 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -109,9 +109,26 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); - if (!fdb_src || READ_ONCE(fdb_src->dst) != p || - test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + if (!fdb_src) { + /* FDB miss. Create locked FDB entry if MAB is enabled + * and drop the packet. + */ + if (p->flags & BR_PORT_MAB) + br_fdb_update(br, p, eth_hdr(skb)->h_source, + vid, BIT(BR_FDB_LOCKED)); goto drop; + } else if (READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { + /* FDB mismatch. 
Drop the packet without roaming. */ goto drop; } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) { /* FDB match, but entry is locked. Refresh it and drop * the packet. */ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, BIT(BR_FDB_LOCKED)); goto drop; } } nbp_switchdev_frame_mark(p, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d087fd4c784a..4316cc82ae17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -188,6 +188,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + + nla_total_size(1) /* IFLA_BRPORT_MAB */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -274,7 +275,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || - nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) || + nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -876,6 +878,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, + [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -943,6 +946,22 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); + br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB); + + if ((p->flags & BR_PORT_MAB) && + (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) { + NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled"); + p->flags = old_flags; + return -EINVAL; + } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) { + struct net_bridge_fdb_flush_desc desc = { + .flags = BIT(BR_FDB_LOCKED), + .flags_mask = BIT(BR_FDB_LOCKED), + .port_ifindex = p->dev->ifindex, + }; + + br_fdb_flush(p->br, &desc); + } changed_mask = old_flags ^ p->flags; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 06e5f6faa431..4ce8b8e5ae0b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,7 +251,8 @@ enum { BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, BR_FDB_NOTIFY, - BR_FDB_NOTIFY_INACTIVE + BR_FDB_NOTIFY_INACTIVE, + BR_FDB_LOCKED, }; struct net_bridge_fdb_key { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d2f27548fc0b..b64fffeb3844 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4051,6 +4051,11 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, return err; } + if (tb[NDA_FLAGS_EXT]) { + netdev_info(dev, "invalid flags given to default FDB implementation\n"); + return err; + } + if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; -- cgit v1.2.3 From 02a68a47eadedf95748facfca6ced31fb0181d52
Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:03 +0100 Subject: net: devlink: track netdev with devlink_port assigned Currently, ethernet drivers are using devlink_port_type_eth_set() and devlink_port_type_clear() to set the devlink port type and link it to the related netdev. Instead of calling them directly, let the driver use the SET_NETDEV_DEVLINK_PORT macro to assign the devlink_port pointer and let devlink track it. Note the devlink port pointer is static during the time the netdevice is registered. In devlink code, use a per-namespace netdev notifier to track the netdevices with devlink_port assigned and change the internal devlink_port type and related type pointer accordingly. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 ++++++++++++ net/core/dev.c | 14 +++++---- net/core/devlink.c | 75 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 99 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b5052db978f..f048a30ea10b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1999,6 +1999,11 @@ enum netdev_ml_priv_type { * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * + * @devlink_port: Pointer to related devlink port structure. + * Assigned by a driver before netdev registration using + * SET_NETDEV_DEVLINK_PORT macro. This pointer is static + * during the time netdevice is registered. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2349,9 +2354,22 @@ struct net_device { netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; + + struct devlink_port *devlink_port; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +/* + * Driver should use this to assign devlink port instance to a netdevice + * before it registers the netdevice. Therefore devlink_port is static + * during the netdev lifetime after it is registered.
+ */ +#define SET_NETDEV_DEVLINK_PORT(dev, port) \ +({ \ + WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ + ((dev)->devlink_port = (port)); \ +}) + static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) @@ -2785,6 +2803,7 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, + NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, diff --git a/net/core/dev.c b/net/core/dev.c index 2e4f1c97b59e..3bacee3bee78 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1621,10 +1621,10 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) - N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) - N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) - N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) - N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) + N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) + N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) + N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) @@ -10060,7 +10060,7 @@ int register_netdevice(struct net_device *dev) dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; write_unlock(&dev_base_lock); if (ret) - goto err_uninit; + goto err_uninit_notify; __netdev_update_features(dev); @@ -10107,6 +10107,8 @@ int register_netdevice(struct net_device *dev) out: return ret; +err_uninit_notify: + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10856,6 +10858,8 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index 3387dfbb80c5..6f06c05c7b1a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -71,6 +71,7 @@ struct devlink { refcount_t refcount; struct completion comp; struct rcu_head rcu; + struct notifier_block netdevice_nb; char priv[] __aligned(NETDEV_ALIGN); }; @@ -9615,6 +9616,9 @@ void devlink_set_features(struct devlink *devlink, u64 features) } EXPORT_SYMBOL_GPL(devlink_set_features); +static int devlink_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr); + /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace @@ -9645,10 +9649,13 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); - if (ret < 0) { - kfree(devlink); - return NULL; - } + if (ret < 0) + goto err_xa_alloc; + + devlink->netdevice_nb.notifier_call = devlink_netdevice_event; + ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); + if (ret) + goto err_register_netdevice_notifier; devlink->dev = dev; devlink->ops = ops; @@ -9675,6 +9682,12 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, 
init_completion(&devlink->comp); return devlink; + +err_register_netdevice_notifier: + xa_erase(&devlinks, devlink->index); +err_xa_alloc: + kfree(devlink); + return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); @@ -9828,6 +9841,10 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); + + unregister_netdevice_notifier_net(devlink_net(devlink), + &devlink->netdevice_nb); + xa_erase(&devlinks, devlink->index); kfree(devlink); @@ -10121,6 +10138,56 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_type_clear); +static int devlink_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct devlink_port *devlink_port = netdev->devlink_port; + struct devlink *devlink; + + devlink = container_of(nb, struct devlink, netdevice_nb); + + if (!devlink_port || devlink_port->devlink != devlink) + return NOTIFY_OK; + + switch (event) { + case NETDEV_POST_INIT: + /* Set the type but not netdev pointer. It is going to be set + * later on by NETDEV_REGISTER event. Happens once during + * netdevice register + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + NULL, true); + break; + case NETDEV_REGISTER: + /* Set the netdev on top of previously set type. Note this + * event happens also during net namespace change so here + * we take into account netdev pointer appearing in this + * namespace. + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + netdev, true); + break; + case NETDEV_UNREGISTER: + /* Clear netdev pointer, but not the type. This event happens + * also during net namespace change so we need to clear + * pointer to netdev that is going to another net namespace. + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + NULL, true); + break; + case NETDEV_PRE_UNINIT: + /* Clear the type and the netdev pointer. Happens once during + * netdevice unregister. + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, + NULL, true); + break; + } + + return NOTIFY_OK; +} + static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { -- cgit v1.2.3 From 77df1db80da384c565106321f5934967690da7dd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:10 +0100 Subject: net: remove unused ndo_get_devlink_port Remove ndo_get_devlink_port, which is no longer used, along with its implementations in the drivers.
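A short driver-side sketch of the pattern that replaces the removed callback (structure and names are illustrative): assign the devlink port once before registration and let devlink's netdev notifier maintain the association from then on.

static int example_port_init(struct example_priv *priv)
{
	/* Tie the devlink port to the netdev before register_netdev();
	 * devlink's per-namespace notifier then sets and clears the port
	 * type and netdev pointer on register/unregister events.
	 */
	SET_NETDEV_DEVLINK_PORT(priv->netdev, &priv->devlink_port);
	return register_netdev(priv->netdev);
}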
Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 -------- drivers/net/ethernet/fungible/funeth/funeth_main.c | 8 -------- drivers/net/ethernet/intel/ice/ice_main.c | 15 --------------- drivers/net/ethernet/intel/ice/ice_repr.c | 9 --------- drivers/net/ethernet/marvell/prestera/prestera_devlink.c | 7 ------- drivers/net/ethernet/marvell/prestera/prestera_devlink.h | 2 -- drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c | 10 ---------- drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 10 ---------- drivers/net/ethernet/mellanox/mlxsw/minimal.c | 11 ----------- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 11 ----------- drivers/net/ethernet/mscc/ocelot_net.c | 10 ---------- drivers/net/ethernet/netronome/nfp/nfp_app.h | 2 -- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 11 ----------- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 -- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 1 - drivers/net/ethernet/ti/am65-cpsw-nuss.c | 8 -------- drivers/net/netdevsim/netdev.c | 9 --------- include/linux/netdevice.h | 5 ----- net/dsa/slave.c | 8 -------- 22 files changed, 150 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9604294b67aa..03272aa48511 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13073,13 +13073,6 @@ int bnxt_get_port_parent_id(struct net_device *dev, return 0; } -static struct devlink_port *bnxt_get_devlink_port(struct net_device *dev) -{ - struct bnxt *bp = netdev_priv(dev); - - return &bp->dl_port; -} - static const struct net_device_ops bnxt_netdev_ops = { .ndo_open = bnxt_open, .ndo_start_xmit = bnxt_start_xmit, @@ -13111,7 +13104,6 @@ static const struct net_device_ops bnxt_netdev_ops = { .ndo_xdp_xmit = bnxt_xdp_xmit, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, - .ndo_get_devlink_port = bnxt_get_devlink_port, }; static void bnxt_remove_one(struct pci_dev *pdev) diff --git a/drivers/net/ethernet/fungible/funeth/funeth_main.c b/drivers/net/ethernet/fungible/funeth/funeth_main.c index 208dc89f4972..b4cce30e526a 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_main.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_main.c @@ -1178,13 +1178,6 @@ static int fun_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static struct devlink_port *fun_get_devlink_port(struct net_device *netdev) -{ - struct funeth_priv *fp = netdev_priv(netdev); - - return &fp->dl_port; -} - static int fun_init_vports(struct fun_ethdev *ed, unsigned int n) { if (ed->num_vports) @@ -1350,7 +1343,6 @@ static const struct net_device_ops fun_netdev_ops = { .ndo_set_vf_vlan = fun_set_vf_vlan, .ndo_set_vf_rate = fun_set_vf_rate, .ndo_get_vf_config = fun_get_vf_config, - .ndo_get_devlink_port = fun_get_devlink_port, }; #define GSO_ENCAP_FLAGS (NETIF_F_GSO_GRE | NETIF_F_GSO_IPXIP4 | \ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 74d25fda11bd..a9fc89aebebe 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -298,20 +298,6 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) return status; } -/** - * ice_get_devlink_port - Get 
devlink port from netdev - * @netdev: the netdevice structure - */ -static struct devlink_port *ice_get_devlink_port(struct net_device *netdev) -{ - struct ice_pf *pf = ice_netdev_to_pf(netdev); - - if (!ice_is_switchdev_running(pf)) - return NULL; - - return &pf->devlink_port; -} - /** * ice_vsi_sync_fltr - Update the VSI filter list to the HW * @vsi: ptr to the VSI @@ -9107,5 +9093,4 @@ static const struct net_device_ops ice_netdev_ops = { .ndo_bpf = ice_xdp, .ndo_xdp_xmit = ice_xdp_xmit, .ndo_xsk_wakeup = ice_xsk_wakeup, - .ndo_get_devlink_port = ice_get_devlink_port, }; diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index 663a7a0e1814..0483eb14c288 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -134,14 +134,6 @@ static int ice_repr_stop(struct net_device *netdev) return 0; } -static struct devlink_port * -ice_repr_get_devlink_port(struct net_device *netdev) -{ - struct ice_repr *repr = ice_netdev_to_repr(netdev); - - return &repr->vf->devlink_port; -} - /** * ice_repr_sp_stats64 - get slow path stats for port representor * @dev: network interface device structure @@ -250,7 +242,6 @@ static const struct net_device_ops ice_repr_netdev_ops = { .ndo_open = ice_repr_open, .ndo_stop = ice_repr_stop, .ndo_start_xmit = ice_eswitch_port_start_xmit, - .ndo_get_devlink_port = ice_repr_get_devlink_port, .ndo_setup_tc = ice_repr_setup_tc, .ndo_has_offload_stats = ice_repr_ndo_has_offload_stats, .ndo_get_offload_stats = ice_repr_ndo_get_offload_stats, diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c index 637b8fee65e7..84ad05c9f12d 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c @@ -445,13 +445,6 @@ void prestera_devlink_port_unregister(struct prestera_port *port) devlink_port_unregister(&port->dl_port); } -struct devlink_port *prestera_devlink_get_port(struct net_device *dev) -{ - struct prestera_port *port = netdev_priv(dev); - - return &port->dl_port; -} - int prestera_devlink_traps_register(struct prestera_switch *sw) { const u32 groups_count = ARRAY_SIZE(prestera_trap_groups_arr); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h index 04e8556f748a..bf84ad6fd87e 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h @@ -15,8 +15,6 @@ void prestera_devlink_unregister(struct prestera_switch *sw); int prestera_devlink_port_register(struct prestera_port *port); void prestera_devlink_port_unregister(struct prestera_port *port); -struct devlink_port *prestera_devlink_get_port(struct net_device *dev); - void prestera_devlink_trap_report(struct prestera_port *port, struct sk_buff *skb, u8 cpu_code); int prestera_devlink_traps_register(struct prestera_switch *sw); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index a76bf3606aa6..edbdda8f958d 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -569,7 +569,6 @@ static const struct net_device_ops prestera_netdev_ops = { .ndo_change_mtu = prestera_port_change_mtu, .ndo_get_stats64 = prestera_port_get_stats64, .ndo_set_mac_address = prestera_port_set_mac_address, - 
.ndo_get_devlink_port = prestera_devlink_get_port, }; int prestera_port_autoneg_set(struct prestera_port *port, u64 link_modes) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index ce0e56f856d6..83adaabf59f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -62,13 +62,3 @@ void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv) if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW)) devl_unlock(devlink); } - -struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - - if (!netif_device_present(dev)) - return NULL; - - return mlx5e_devlink_get_dl_port(priv); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h index 1c203257ac30..4f238d4fff55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h @@ -9,7 +9,6 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv); void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv); -struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev); static inline struct devlink_port * mlx5e_devlink_get_dl_port(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 97e876472a06..78d34c039fa7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4897,7 +4897,6 @@ const struct net_device_ops mlx5e_netdev_ops = { .ndo_has_offload_stats = mlx5e_has_offload_stats, .ndo_get_offload_stats = mlx5e_get_offload_stats, #endif - .ndo_get_devlink_port = mlx5e_get_devlink_port, }; static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5412633b9c8c..1b53e8852c86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -607,15 +607,6 @@ static int mlx5e_rep_change_mtu(struct net_device *netdev, int new_mtu) return mlx5e_change_mtu(netdev, new_mtu, NULL); } -static struct devlink_port *mlx5e_rep_get_devlink_port(struct net_device *netdev) -{ - struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5_core_dev *dev = priv->mdev; - - return mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); -} - static int mlx5e_rep_change_carrier(struct net_device *dev, bool new_carrier) { struct mlx5e_priv *priv = netdev_priv(dev); @@ -644,7 +635,6 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = { .ndo_stop = mlx5e_rep_close, .ndo_start_xmit = mlx5e_xmit, .ndo_setup_tc = mlx5e_rep_setup_tc, - .ndo_get_devlink_port = mlx5e_rep_get_devlink_port, .ndo_get_stats64 = mlx5e_rep_get_stats, .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c index 177cf7e4db34..6b56eadd736e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/minimal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -81,20 +81,9 @@ static int mlxsw_m_port_stop(struct net_device *dev) return 0; } -static struct devlink_port * 
-mlxsw_m_port_get_devlink_port(struct net_device *dev) -{ - struct mlxsw_m_port *mlxsw_m_port = netdev_priv(dev); - struct mlxsw_m *mlxsw_m = mlxsw_m_port->mlxsw_m; - - return mlxsw_core_port_devlink_port_get(mlxsw_m->core, - mlxsw_m_port->local_port); -} - static const struct net_device_ops mlxsw_m_port_netdev_ops = { .ndo_open = mlxsw_m_port_open, .ndo_stop = mlxsw_m_port_stop, - .ndo_get_devlink_port = mlxsw_m_port_get_devlink_port, }; static void mlxsw_m_module_get_drvinfo(struct net_device *dev, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index b13739e1f8f3..04dc79da6024 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1259,16 +1259,6 @@ static int mlxsw_sp_set_features(struct net_device *dev, return 0; } -static struct devlink_port * -mlxsw_sp_port_get_devlink_port(struct net_device *dev) -{ - struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - - return mlxsw_core_port_devlink_port_get(mlxsw_sp->core, - mlxsw_sp_port->local_port); -} - static int mlxsw_sp_port_hwtstamp_set(struct mlxsw_sp_port *mlxsw_sp_port, struct ifreq *ifr) { @@ -1342,7 +1332,6 @@ static const struct net_device_ops mlxsw_sp_port_netdev_ops = { .ndo_vlan_rx_add_vid = mlxsw_sp_port_add_vid, .ndo_vlan_rx_kill_vid = mlxsw_sp_port_kill_vid, .ndo_set_features = mlxsw_sp_set_features, - .ndo_get_devlink_port = mlxsw_sp_port_get_devlink_port, .ndo_eth_ioctl = mlxsw_sp_port_ioctl, }; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 5efc07751c8d..f50dada2bb8e 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -194,15 +194,6 @@ void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port) devlink_port_unregister(dlp); } -static struct devlink_port *ocelot_get_devlink_port(struct net_device *dev) -{ - struct ocelot_port_private *priv = netdev_priv(dev); - struct ocelot *ocelot = priv->port.ocelot; - int port = priv->port.index; - - return &ocelot->devlink_ports[port]; -} - int ocelot_setup_tc_cls_flower(struct ocelot_port_private *priv, struct flow_cls_offload *f, bool ingress) @@ -925,7 +916,6 @@ static const struct net_device_ops ocelot_port_netdev_ops = { .ndo_set_features = ocelot_set_features, .ndo_setup_tc = ocelot_setup_tc, .ndo_eth_ioctl = ocelot_ioctl, - .ndo_get_devlink_port = ocelot_get_devlink_port, }; struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index dd56207df246..90707346a4ef 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -445,6 +445,4 @@ int nfp_app_nic_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, int nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app, struct nfp_net *nn, unsigned int id); -struct devlink_port *nfp_devlink_get_devlink_port(struct net_device *netdev); - #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index f3f0f11d8b52..8bfd48d50ef0 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -362,14 +362,3 @@ void nfp_devlink_port_unregister(struct nfp_port *port) { devl_port_unregister(&port->dl_port); } - -struct devlink_port 
*nfp_devlink_get_devlink_port(struct net_device *netdev) -{ - struct nfp_port *port; - - port = nfp_port_from_netdev(netdev); - if (!port) - return NULL; - - return &port->dl_port; -} diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index a5ca5c4a7896..8c1a870bc0e5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2013,7 +2013,6 @@ const struct net_device_ops nfp_nfd3_netdev_ops = { .ndo_get_phys_port_name = nfp_net_get_phys_port_name, .ndo_bpf = nfp_net_xdp, .ndo_xsk_wakeup = nfp_net_xsk_wakeup, - .ndo_get_devlink_port = nfp_devlink_get_devlink_port, .ndo_bridge_getlink = nfp_net_bridge_getlink, .ndo_bridge_setlink = nfp_net_bridge_setlink, }; @@ -2044,7 +2043,6 @@ const struct net_device_ops nfp_nfdk_netdev_ops = { .ndo_features_check = nfp_net_features_check, .ndo_get_phys_port_name = nfp_net_get_phys_port_name, .ndo_bpf = nfp_net_xdp, - .ndo_get_devlink_port = nfp_devlink_get_devlink_port, .ndo_bridge_getlink = nfp_net_bridge_getlink, .ndo_bridge_setlink = nfp_net_bridge_setlink, }; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index a6b6ca1fd55e..3af1229a3f08 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -275,7 +275,6 @@ const struct net_device_ops nfp_repr_netdev_ops = { .ndo_set_features = nfp_port_set_features, .ndo_set_mac_address = eth_mac_addr, .ndo_get_port_parent_id = nfp_port_get_port_parent_id, - .ndo_get_devlink_port = nfp_devlink_get_devlink_port, }; void diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index b2a08f3f0e93..0dbdce4dc077 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1380,13 +1380,6 @@ static void am65_cpsw_nuss_ndo_get_stats(struct net_device *dev, stats->tx_dropped = dev->stats.tx_dropped; } -static struct devlink_port *am65_cpsw_ndo_get_devlink_port(struct net_device *ndev) -{ - struct am65_cpsw_port *port = am65_ndev_to_port(ndev); - - return &port->devlink_port; -} - static const struct net_device_ops am65_cpsw_nuss_netdev_ops = { .ndo_open = am65_cpsw_nuss_ndo_slave_open, .ndo_stop = am65_cpsw_nuss_ndo_slave_stop, @@ -1400,7 +1393,6 @@ static const struct net_device_ops am65_cpsw_nuss_netdev_ops = { .ndo_vlan_rx_kill_vid = am65_cpsw_nuss_ndo_slave_kill_vid, .ndo_eth_ioctl = am65_cpsw_nuss_ndo_slave_ioctl, .ndo_setup_tc = am65_cpsw_qos_ndo_setup_tc, - .ndo_get_devlink_port = am65_cpsw_ndo_get_devlink_port, }; static void am65_cpsw_nuss_mac_config(struct phylink_config *config, unsigned int mode, diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index e7cbae743e86..6db6a75ff9b9 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -238,13 +238,6 @@ nsim_set_features(struct net_device *dev, netdev_features_t features) return 0; } -static struct devlink_port *nsim_get_devlink_port(struct net_device *dev) -{ - struct netdevsim *ns = netdev_priv(dev); - - return &ns->nsim_dev_port->devlink_port; -} - static const struct net_device_ops nsim_netdev_ops = { .ndo_start_xmit = nsim_start_xmit, .ndo_set_rx_mode = nsim_set_rx_mode, @@ -263,7 +256,6 @@ static const struct net_device_ops nsim_netdev_ops = { .ndo_setup_tc = nsim_setup_tc, .ndo_set_features = nsim_set_features, .ndo_bpf = nsim_bpf, - .ndo_get_devlink_port 
= nsim_get_devlink_port, }; static const struct net_device_ops nsim_vf_netdev_ops = { @@ -275,7 +267,6 @@ static const struct net_device_ops nsim_vf_netdev_ops = { .ndo_get_stats64 = nsim_get_stats64, .ndo_setup_tc = nsim_setup_tc, .ndo_set_features = nsim_set_features, - .ndo_get_devlink_port = nsim_get_devlink_port, }; static void nsim_setup(struct net_device *dev) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f048a30ea10b..d45713a06568 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1366,10 +1366,6 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev); - * Get devlink port instance associated with a given netdev. - * Called with a reference on the netdevice and devlink locks only, - * rtnl_lock is not held. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. @@ -1600,7 +1596,6 @@ struct net_device_ops { struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); - struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 158bb3f1aa0b..4176482cd03f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2165,13 +2165,6 @@ static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = { .ieee_delapp = dsa_slave_dcbnl_ieee_delapp, }; -static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return &dp->devlink_port; -} - static void dsa_slave_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { @@ -2219,7 +2212,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_get_stats64 = dsa_slave_get_stats64, .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, - .ndo_get_devlink_port = dsa_slave_get_devlink_port, .ndo_change_mtu = dsa_slave_change_mtu, .ndo_fill_forward_path = dsa_slave_fill_forward_path, }; -- cgit v1.2.3 From aa3496accc412b3d975e4ee5d06076d73394d8b5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:55 +0530 Subject: bpf: Refactor kptr_off_tab into btf_record To prepare the BPF verifier to handle special fields in both map values and program allocated types coming from program BTF, we need to refactor the kptr_off_tab handling code into something more generic and reusable across both cases to avoid code duplication. Later patches also require passing this data to helpers at runtime, so that they can work on user defined types, initialize them, destruct them, etc. The main observation is that both map values and such allocated types point to a type in program BTF, hence they can be handled similarly. We can prepare a field metadata table for both cases and store them in struct bpf_map or struct btf depending on the use case. Hence, refactor the code into generic btf_record and btf_field member structs. The btf_record represents the fields of a specific btf_type in user BTF. 
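(Illustration only, not part of the patch: a minimal sketch of the record shape and a typical consumer, mirroring the definitions and helpers the diff below adds; nothing here is new API beyond what the patch itself introduces.)

struct btf_record {
	u32 cnt;		/* number of special fields recognized */
	u32 field_mask;		/* bitmask of the field types present */
	struct btf_field fields[];	/* kept in increasing-offset order */
};

/* sketch of a typical consumer, using helpers added by this patch */
static bool value_has_ref_kptr_at(const struct bpf_map *map, u32 off)
{
	struct btf_field *field;

	/* one bitmask test answers "does this value contain a kptr at all?" */
	if (!btf_record_has_field(map->record, BPF_KPTR))
		return false;
	/* fields are offset-sorted, so the lookup can bsearch */
	field = btf_record_find(map->record, off, BPF_KPTR);
	return field && field->type == BPF_KPTR_REF;
}
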
The cnt indicates the number of special fields we successfully recognized, and field_mask is a bitmask of fields that were found, to enable quick determination of availability of a certain field. Subsequently, refactor the rest of the code to work with these generic types, remove assumptions about kptr and kptr_off_tab, rename variables to more meaningful names, etc. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 125 +++++++++++++++-------- include/linux/btf.h | 3 +- kernel/bpf/arraymap.c | 13 ++- kernel/bpf/btf.c | 67 ++++++------ kernel/bpf/hashtab.c | 14 ++- kernel/bpf/map_in_map.c | 14 ++- kernel/bpf/syscall.c | 263 +++++++++++++++++++++++++++--------------------- kernel/bpf/verifier.c | 96 +++++++++--------- 8 files changed, 332 insertions(+), 263 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8d948bfcb984..5f2a42033a37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,35 +165,41 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BPF map value */ - BPF_MAP_VALUE_OFF_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + /* Support at most 8 pointers in a BTF type */ + BTF_FIELDS_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + 1 + /* for bpf_spin_lock */ 1, /* for bpf_timer */ }; -enum bpf_kptr_type { - BPF_KPTR_UNREF, - BPF_KPTR_REF, +enum btf_field_type { + BPF_KPTR_UNREF = (1 << 2), + BPF_KPTR_REF = (1 << 3), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, }; -struct bpf_map_value_off_desc { +struct btf_field_kptr { + struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; + u32 btf_id; +}; + +struct btf_field { u32 offset; - enum bpf_kptr_type type; - struct { - struct btf *btf; - struct module *module; - btf_dtor_kfunc_t dtor; - u32 btf_id; - } kptr; + enum btf_field_type type; + union { + struct btf_field_kptr kptr; + }; }; -struct bpf_map_value_off { - u32 nr_off; - struct bpf_map_value_off_desc off[]; +struct btf_record { + u32 cnt; + u32 field_mask; + struct btf_field fields[]; }; -struct bpf_map_off_arr { +struct btf_field_offs { u32 cnt; u32 field_off[BPF_MAP_OFF_ARR_MAX]; u8 field_sz[BPF_MAP_OFF_ARR_MAX]; @@ -215,7 +221,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ - struct bpf_map_value_off *kptr_off_tab; + struct btf_record *record; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -227,7 +233,7 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct bpf_map_off_arr *off_arr; + struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. 
*/ @@ -251,6 +257,37 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline u32 btf_field_type_size(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return sizeof(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline u32 btf_field_type_align(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return __alignof__(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) +{ + if (IS_ERR_OR_NULL(rec)) + return false; + return rec->field_mask & type; +} + static inline bool map_value_has_spin_lock(const struct bpf_map *map) { return map->spin_lock_off >= 0; @@ -261,23 +298,19 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } -static inline bool map_value_has_kptrs(const struct bpf_map *map) -{ - return !IS_ERR_OR_NULL(map->kptr_off_tab); -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); - if (unlikely(map_value_has_kptrs(map))) { - struct bpf_map_value_off *tab = map->kptr_off_tab; + if (!IS_ERR_OR_NULL(map->record)) { + struct btf_field *fields = map->record->fields; + u32 cnt = map->record->cnt; int i; - for (i = 0; i < tab->nr_off; i++) - *(u64 *)(dst + tab->off[i].offset) = 0; + for (i = 0; i < cnt; i++) + memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); } } @@ -303,7 +336,7 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); else @@ -311,11 +344,12 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memcpy(dst + curr_off, src + curr_off, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memcpy(dst + curr_off, src + curr_off, sz); + curr_off += map->field_offs->field_sz[i]; } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } @@ -335,16 +369,17 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { memset(dst, 0, map->value_size); return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memset(dst + curr_off, 0, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memset(dst + curr_off, 0, sz); + curr_off += map->field_offs->field_sz[i]; } memset(dst + curr_off, 0, map->value_size - curr_off); } @@ -1699,11 +1734,13 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); -void 
bpf_map_free_kptr_off_tab(struct bpf_map *map); -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); +struct btf_field *btf_record_find(const struct btf_record *rec, + u32 offset, enum btf_field_type type); +void btf_record_free(struct btf_record *rec); +void bpf_map_free_record(struct bpf_map *map); +struct btf_record *btf_record_dup(const struct btf_record *rec); +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 86aad9b2ce02..9e62717cdc7a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,8 +163,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..417f84342e98 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -310,8 +310,7 @@ static void check_and_free_fields(struct bpf_array *arr, void *val) { if (map_value_has_timer(&arr->map)) bpf_timer_cancel_and_free(val + arr->map.timer_off); - if (map_value_has_kptrs(&arr->map)) - bpf_map_free_kptrs(&arr->map, val); + bpf_obj_free_fields(arr->map.record, val); } /* Called from syscall or from eBPF program */ @@ -409,7 +408,7 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free kptr on uref dropping to zero. */ + /* We don't reset or free fields other than timer on uref dropping to zero. 
*/ if (!map_value_has_timer(map)) return; @@ -423,22 +422,22 @@ static void array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (map_value_has_kptrs(map)) { + if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { - bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f4d21eef6ebd..8391a77138ee 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3191,7 +3191,7 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_type { +enum btf_field_info_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, BTF_FIELD_KPTR, @@ -3203,9 +3203,9 @@ enum { }; struct btf_field_info { - u32 type_id; + enum btf_field_type type; u32 off; - enum bpf_kptr_type type; + u32 type_id; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, @@ -3222,7 +3222,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { - enum bpf_kptr_type type; + enum btf_field_type type; u32 res_id; /* Permit modifiers on the pointer itself */ @@ -3259,7 +3259,7 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, const char *name, int sz, int align, - enum btf_field_type field_type, + enum btf_field_info_type field_type, struct btf_field_info *info, int info_cnt) { const struct btf_member *member; @@ -3311,7 +3311,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, const char *name, int sz, int align, - enum btf_field_type field_type, + enum btf_field_info_type field_type, struct btf_field_info *info, int info_cnt) { const struct btf_var_secinfo *vsi; @@ -3360,7 +3360,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_type field_type, + enum btf_field_info_type field_type, struct btf_field_info *info, int info_cnt) { const char *name; @@ -3423,14 +3423,13 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t) return info.off; } -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t) { - struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; - struct bpf_map_value_off *tab; + struct btf_field_info info_arr[BTF_FIELDS_MAX]; struct btf *kernel_btf = NULL; struct module *mod = NULL; - int ret, i, nr_off; + struct btf_record *rec; + int ret, i, cnt; ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) @@ -3438,12 
+3437,12 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, if (!ret) return NULL; - nr_off = ret; - tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); - if (!tab) + cnt = ret; + rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + if (!rec) return ERR_PTR(-ENOMEM); - - for (i = 0; i < nr_off; i++) { + rec->cnt = 0; + for (i = 0; i < cnt; i++) { const struct btf_type *t; s32 id; @@ -3500,28 +3499,24 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, ret = -EINVAL; goto end_mod; } - tab->off[i].kptr.dtor = (void *)addr; + rec->fields[i].kptr.dtor = (void *)addr; } - tab->off[i].offset = info_arr[i].off; - tab->off[i].type = info_arr[i].type; - tab->off[i].kptr.btf_id = id; - tab->off[i].kptr.btf = kernel_btf; - tab->off[i].kptr.module = mod; + rec->field_mask |= info_arr[i].type; + rec->fields[i].offset = info_arr[i].off; + rec->fields[i].type = info_arr[i].type; + rec->fields[i].kptr.btf_id = id; + rec->fields[i].kptr.btf = kernel_btf; + rec->fields[i].kptr.module = mod; + rec->cnt++; } - tab->nr_off = nr_off; - return tab; + return rec; end_mod: module_put(mod); end_btf: btf_put(kernel_btf); end: - while (i--) { - btf_put(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - } - kfree(tab); + btf_record_free(rec); return ERR_PTR(ret); } @@ -6370,7 +6365,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* kptr_get is only true for kfunc */ if (i == 0 && kptr_get) { - struct bpf_map_value_off_desc *off_desc; + struct btf_field *kptr_field; if (reg->type != PTR_TO_MAP_VALUE) { bpf_log(log, "arg#0 expected pointer to map value\n"); @@ -6386,8 +6381,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); - if (!off_desc || off_desc->type != BPF_KPTR_REF) { + kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); + if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", reg->off + reg->var_off.value); return -EINVAL; @@ -6406,8 +6401,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; } - if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, - off_desc->kptr.btf_id, true)) { + if (!btf_struct_ids_match(log, btf, ref_id, 0, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, true)) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname); return -EINVAL; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f39ee3e05589..c5ea8f9bb7a9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -238,21 +238,20 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } -static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_kptrs(&htab->map)) + if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); - for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, elem->key + 
round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -766,8 +765,7 @@ static void check_and_free_fields(struct bpf_htab *htab, if (map_value_has_timer(&htab->map)) bpf_timer_cancel_and_free(map_value + htab->map.timer_off); - if (map_value_has_kptrs(&htab->map)) - bpf_map_free_kptrs(&htab->map, map_value); + bpf_obj_free_fields(htab->map.record, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -1517,11 +1515,11 @@ static void htab_map_free(struct bpf_map *map) if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { - htab_free_prealloced_kptrs(htab); + htab_free_prealloced_fields(htab); prealloc_destroy(htab); } - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 135205d0d560..d6c662183f88 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,7 +52,15 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; - inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); + inner_map_meta->record = btf_record_dup(inner_map->record); + if (IS_ERR(inner_map_meta->record)) { + /* btf_record_dup returns NULL or valid pointer in case of + * invalid/empty/valid, but ERR_PTR in case of errors. During + * equality NULL or IS_ERR is equivalent. + */ + fdput(f); + return ERR_CAST(inner_map_meta->record); + } if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -72,7 +80,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { - bpf_map_free_kptr_off_tab(map_meta); + bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -86,7 +94,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && - bpf_map_equal_kptr_off_tab(meta0, meta1); + btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5887592eeb93..b80c0e2eb73f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -495,114 +495,134 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -static int bpf_map_kptr_off_cmp(const void *a, const void *b) +static int btf_field_cmp(const void *a, const void *b) { - const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + const struct btf_field *f1 = a, *f2 = b; - if (off_desc1->offset < off_desc2->offset) + if (f1->offset < f2->offset) return -1; - else if (off_desc1->offset > off_desc2->offset) + else if (f1->offset > f2->offset) return 1; return 0; } -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, + enum btf_field_type type) { - /* Since members are iterated in btf_find_field in increasing order, - * offsets appended to kptr_off_tab are in increasing order, so we can - * do bsearch to find exact match. 
- */ - struct bpf_map_value_off *tab; + struct btf_field *field; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + return NULL; + field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); + if (!field || !(field->type & type)) return NULL; - tab = map->kptr_off_tab; - return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); + return field; } -void bpf_map_free_kptr_off_tab(struct bpf_map *map) +void btf_record_free(struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab; int i; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < tab->nr_off; i++) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + for (i = 0; i < rec->cnt; i++) { + switch (rec->fields[i].type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (rec->fields[i].kptr.module) + module_put(rec->fields[i].kptr.module); + btf_put(rec->fields[i].kptr.btf); + break; + default: + WARN_ON_ONCE(1); + continue; + } } - kfree(tab); - map->kptr_off_tab = NULL; + kfree(rec); +} + +void bpf_map_free_record(struct bpf_map *map) +{ + btf_record_free(map->record); + map->record = NULL; } -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +struct btf_record *btf_record_dup(const struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; - int size, i; + const struct btf_field *fields; + struct btf_record *new_rec; + int ret, size, i; - if (!map_value_has_kptrs(map)) - return ERR_PTR(-ENOENT); - size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); - new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); - if (!new_tab) + if (IS_ERR_OR_NULL(rec)) + return NULL; + size = offsetof(struct btf_record, fields[rec->cnt]); + new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_rec) return ERR_PTR(-ENOMEM); - /* Do a deep copy of the kptr_off_tab */ - for (i = 0; i < tab->nr_off; i++) { - btf_get(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { - while (i--) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + /* Do a deep copy of the btf_record */ + fields = rec->fields; + new_rec->cnt = 0; + for (i = 0; i < rec->cnt; i++) { + switch (fields[i].type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + btf_get(fields[i].kptr.btf); + if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { + ret = -ENXIO; + goto free; } - kfree(new_tab); - return ERR_PTR(-ENXIO); + break; + default: + ret = -EFAULT; + WARN_ON_ONCE(1); + goto free; } + new_rec->cnt++; } - return new_tab; + return new_rec; +free: + btf_record_free(new_rec); + return ERR_PTR(ret); } -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { - struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; - bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; - if (!a_has_kptr && !b_has_kptr) + if (!a_has_fields && !b_has_fields) return true; - if (a_has_kptr != b_has_kptr) + if (a_has_fields != b_has_fields) return false; - if (tab_a->nr_off != tab_b->nr_off) + if (rec_a->cnt != rec_b->cnt) return false; - size = 
offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); - return !memcmp(tab_a, tab_b, size); + size = offsetof(struct btf_record, fields[rec_a->cnt]); + return !memcmp(rec_a, rec_b, size); } -/* Caller must ensure map_value_has_kptrs is true. Note that this function can - * be called on a map value while the map_value is visible to BPF programs, as - * it ensures the correct synchronization, and we already enforce the same using - * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. - */ -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - unsigned long *btf_id_ptr; + const struct btf_field *fields; int i; - for (i = 0; i < tab->nr_off; i++) { - struct bpf_map_value_off_desc *off_desc = &tab->off[i]; - unsigned long old_ptr; - - btf_id_ptr = map_value + off_desc->offset; - if (off_desc->type == BPF_KPTR_UNREF) { - u64 *p = (u64 *)btf_id_ptr; - - WRITE_ONCE(*p, 0); + if (IS_ERR_OR_NULL(rec)) + return; + fields = rec->fields; + for (i = 0; i < rec->cnt; i++) { + const struct btf_field *field = &fields[i]; + void *field_ptr = obj + field->offset; + + switch (fields[i].type) { + case BPF_KPTR_UNREF: + WRITE_ONCE(*(u64 *)field_ptr, 0); + break; + case BPF_KPTR_REF: + field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + break; + default: + WARN_ON_ONCE(1); continue; } - old_ptr = xchg(btf_id_ptr, 0); - off_desc->kptr.dtor((void *)old_ptr); } } @@ -612,10 +632,10 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); - kfree(map->off_arr); + kfree(map->field_offs); bpf_map_release_memcg(map); /* implementation dependent freeing, map_free callback also does - * bpf_map_free_kptr_off_tab, if needed. + * bpf_map_free_record, if needed. 
*/ map->ops->map_free(map); } @@ -779,7 +799,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) int err; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || map_value_has_kptrs(map)) + map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -906,7 +926,7 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) +static int map_field_offs_cmp(const void *_a, const void *_b, const void *priv) { const u32 a = *(const u32 *)_a; const u32 b = *(const u32 *)_b; @@ -918,15 +938,15 @@ static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) return 0; } -static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) +static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) { struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->off_arr->field_off; + u32 *off_base = map->field_offs->field_off; u32 *a = _a, *b = _b; u8 *sz_a, *sz_b; - sz_a = map->off_arr->field_sz + (a - off_base); - sz_b = map->off_arr->field_sz + (b - off_base); + sz_a = map->field_offs->field_sz + (a - off_base); + sz_b = map->field_offs->field_sz + (b - off_base); swap(*a, *b); swap(*sz_a, *sz_b); @@ -936,51 +956,51 @@ static int bpf_map_alloc_off_arr(struct bpf_map *map) { bool has_spin_lock = map_value_has_spin_lock(map); bool has_timer = map_value_has_timer(map); - bool has_kptrs = map_value_has_kptrs(map); - struct bpf_map_off_arr *off_arr; + bool has_fields = !IS_ERR_OR_NULL(map->record); + struct btf_field_offs *fo; u32 i; - if (!has_spin_lock && !has_timer && !has_kptrs) { - map->off_arr = NULL; + if (!has_spin_lock && !has_timer && !has_fields) { + map->field_offs = NULL; return 0; } - off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); - if (!off_arr) + fo = kmalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); + if (!fo) return -ENOMEM; - map->off_arr = off_arr; + map->field_offs = fo; - off_arr->cnt = 0; + fo->cnt = 0; if (has_spin_lock) { - i = off_arr->cnt; + i = fo->cnt; - off_arr->field_off[i] = map->spin_lock_off; - off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); - off_arr->cnt++; + fo->field_off[i] = map->spin_lock_off; + fo->field_sz[i] = sizeof(struct bpf_spin_lock); + fo->cnt++; } if (has_timer) { - i = off_arr->cnt; + i = fo->cnt; - off_arr->field_off[i] = map->timer_off; - off_arr->field_sz[i] = sizeof(struct bpf_timer); - off_arr->cnt++; + fo->field_off[i] = map->timer_off; + fo->field_sz[i] = sizeof(struct bpf_timer); + fo->cnt++; } - if (has_kptrs) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - u32 *off = &off_arr->field_off[off_arr->cnt]; - u8 *sz = &off_arr->field_sz[off_arr->cnt]; + if (has_fields) { + struct btf_record *rec = map->record; + u32 *off = &fo->field_off[fo->cnt]; + u8 *sz = &fo->field_sz[fo->cnt]; - for (i = 0; i < tab->nr_off; i++) { - *off++ = tab->off[i].offset; - *sz++ = sizeof(u64); + for (i = 0; i < rec->cnt; i++) { + *off++ = rec->fields[i].offset; + *sz++ = btf_field_type_size(rec->fields[i].type); } - off_arr->cnt += tab->nr_off; + fo->cnt += rec->cnt; } - if (off_arr->cnt == 1) + if (fo->cnt == 1) return 0; - sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), - map_off_arr_cmp, map_off_arr_swap, map); + sort_r(fo->field_off, fo->cnt, sizeof(fo->field_off[0]), + map_field_offs_cmp, map_field_offs_swap, map); return 0; } @@ -1038,8 +1058,10 @@ 
static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } - map->kptr_off_tab = btf_parse_kptrs(btf, value_type); - if (map_value_has_kptrs(map)) { + map->record = btf_parse_fields(btf, value_type); + if (!IS_ERR_OR_NULL(map->record)) { + int i; + if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; @@ -1048,12 +1070,25 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ret = -EACCES; goto free_map_tab; } - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { - ret = -EOPNOTSUPP; - goto free_map_tab; + for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { + switch (map->record->field_mask & (1 << i)) { + case 0: + continue; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + default: + /* Fail if map_type checks are missing for a field type */ + ret = -EOPNOTSUPP; + goto free_map_tab; + } } } @@ -1065,7 +1100,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; free_map_tab: - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); return ret; } @@ -1186,7 +1221,7 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); free_map_off_arr: - kfree(map->off_arr); + kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1883,7 +1918,7 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || map_value_has_kptrs(map)) { + map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14d350a25d5d..5ce5364ce898 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -262,7 +262,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; - struct bpf_map_value_off_desc *kptr_off_desc; + struct btf_field *kptr_field; u8 uninit_dynptr_regno; }; @@ -3674,15 +3674,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, } static int map_kptr_match_type(struct bpf_verifier_env *env, - struct bpf_map_value_off_desc *off_desc, + struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags = PTR_MAYBE_NULL; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -3729,15 +3729,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * strict mode to true for type match. 
*/ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - off_desc->kptr.btf, off_desc->kptr.btf_id, - off_desc->type == BPF_KPTR_REF)) + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + kptr_field->type == BPF_KPTR_REF)) goto bad_type; return 0; bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), targ_name); else @@ -3747,7 +3747,7 @@ bad_type: static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, - struct bpf_map_value_off_desc *off_desc) + struct btf_field *kptr_field) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); @@ -3757,7 +3757,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * - Reject cases where variable offset may touch kptr * - size of access (must be BPF_DW) * - tnum_is_const(reg->var_off) - * - off_desc->offset == off + reg->var_off.value + * - kptr_field->offset == off + reg->var_off.value */ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ if (BPF_MODE(insn->code) != BPF_MEM) { @@ -3768,7 +3768,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -3778,19 +3778,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. 
*/ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, - off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && - map_kptr_match_type(env, off_desc, val_reg, value_regno)) + map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { if (insn->imm) { verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", - off_desc->offset); + kptr_field->offset); return -EACCES; } } else { @@ -3809,7 +3809,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; - int err; + struct btf_record *rec; + int err, i; err = check_mem_region_access(env, regno, off, size, map->value_size, zero_size_allowed); @@ -3839,15 +3840,18 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } - if (map_value_has_kptrs(map)) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - int i; - - for (i = 0; i < tab->nr_off; i++) { - u32 p = tab->off[i].offset; - - if (reg->smin_value + off < p + sizeof(u64) && - p < reg->umax_value + off + size) { + if (IS_ERR_OR_NULL(map->record)) + return 0; + rec = map->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 p = field->offset; + + if (reg->smin_value + off < p + btf_field_type_size(field->type) && + p < reg->umax_value + off + size) { + switch (field->type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -3866,10 +3870,13 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } break; + default: + verbose(env, "field cannot be accessed directly by load/store\n"); + return -EACCES; } } } - return err; + return 0; } #define MAX_PACKET_OFF 0xffff @@ -4742,7 +4749,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { - struct bpf_map_value_off_desc *kptr_off_desc = NULL; + struct btf_field *kptr_field = NULL; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -4756,11 +4763,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; if (tnum_is_const(reg->var_off)) - kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, - off + reg->var_off.value); - if (kptr_off_desc) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, - kptr_off_desc); + kptr_field = btf_record_find(reg->map_ptr->record, + off + reg->var_off.value, BPF_KPTR); + if (kptr_field) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -5527,10 +5533,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map_value_off_desc *off_desc; struct bpf_map *map_ptr = 
reg->map_ptr; + struct btf_field *kptr_field; u32 kptr_off; - int ret; if (!tnum_is_const(reg->var_off)) { verbose(env, @@ -5543,30 +5548,23 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, map_ptr->name); return -EINVAL; } - if (!map_value_has_kptrs(map_ptr)) { - ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); - if (ret == -E2BIG) - verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, - BPF_MAP_VALUE_OFF_MAX); - else if (ret == -EEXIST) - verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); - else - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); return -EINVAL; } meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); - if (!off_desc) { + kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (off_desc->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } - meta->kptr_off_desc = off_desc; + meta->kptr_field = kptr_field; return 0; } @@ -5788,7 +5786,7 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { @@ -7536,8 +7534,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; if (func_id == BPF_FUNC_kptr_xchg) { - ret_btf = meta.kptr_off_desc->kptr.btf; - ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + ret_btf = meta.kptr_field->kptr.btf; + ret_btf_id = meta.kptr_field->kptr.btf_id; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); -- cgit v1.2.3 From db559117828d2448fe81ada051c60bcf39f822e9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:56 +0530 Subject: bpf: Consolidate spin_lock, timer management into btf_record Now that kptr_off_tab has been refactored into btf_record, and can hold more than one specific field type, accommodate bpf_spin_lock and bpf_timer as well. While they don't require any more metadata than offset, having all special fields in one place allows us to share the same code for allocated user defined types and handle both map values and these allocated objects in a similar fashion. As an optimization, we still keep spin_lock_off and timer_off offsets in the btf_record structure, just to avoid having to find the btf_field struct each time their offset is needed. This is mostly needed to manipulate such objects in a map value at runtime. It's ok to hardcode just one offset as more than one field is disallowed. 
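(Illustration only, not part of the patch: a sketch of how the cached offsets are meant to be used, assuming they keep the old bpf_map convention of '>= 0 valid offset, < 0 absent'; lookup_value_lock() is a hypothetical helper, not something this series adds.)

struct btf_record {
	u32 cnt;
	u32 field_mask;
	int spin_lock_off;	/* cached offset of the lone bpf_spin_lock */
	int timer_off;		/* cached offset of the lone bpf_timer */
	struct btf_field fields[];
};

/* fast path: no search through rec->fields for the hot lock/timer cases */
static struct bpf_spin_lock *lookup_value_lock(const struct btf_record *rec,
					       void *map_value)
{
	if (!btf_record_has_field(rec, BPF_SPIN_LOCK))
		return NULL;
	return (struct bpf_spin_lock *)(map_value + rec->spin_lock_off);
}
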
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 53 ++++--- include/linux/btf.h | 3 +- kernel/bpf/arraymap.c | 19 +-- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/btf.c | 323 ++++++++++++++++++++++------------------- kernel/bpf/hashtab.c | 24 +-- kernel/bpf/helpers.c | 6 +- kernel/bpf/local_storage.c | 2 +- kernel/bpf/map_in_map.c | 5 +- kernel/bpf/syscall.c | 135 ++++++++--------- kernel/bpf/verifier.c | 82 +++-------- net/core/bpf_sk_storage.c | 4 +- 12 files changed, 314 insertions(+), 344 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f2a42033a37..aae85019abde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -166,13 +166,13 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + - 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + BTF_FIELDS_MAX = 10, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, }; enum btf_field_type { + BPF_SPIN_LOCK = (1 << 0), + BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, @@ -196,6 +196,8 @@ struct btf_field { struct btf_record { u32 cnt; u32 field_mask; + int spin_lock_off; + int timer_off; struct btf_field fields[]; }; @@ -220,10 +222,8 @@ struct bpf_map { u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ - struct btf_record *record; - int timer_off; /* >=0 valid offset, <0 error */ u32 id; + struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -257,9 +257,29 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline const char *btf_field_type_name(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + return "bpf_spin_lock"; + case BPF_TIMER: + return "bpf_timer"; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return "kptr"; + default: + WARN_ON_ONCE(1); + return "unknown"; + } +} + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return sizeof(struct bpf_spin_lock); + case BPF_TIMER: + return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); @@ -272,6 +292,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return __alignof__(struct bpf_spin_lock); + case BPF_TIMER: + return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); @@ -288,22 +312,8 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline bool map_value_has_timer(const struct bpf_map *map) -{ - return map->timer_off >= 0; -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - if (unlikely(map_value_has_spin_lock(map))) - memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); - if (unlikely(map_value_has_timer(map))) - memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); if (!IS_ERR_OR_NULL(map->record)) { struct btf_field *fields = map->record->fields; u32 cnt = map->record->cnt; @@ 
-1740,6 +1750,7 @@ void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 9e62717cdc7a..282006abd062 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,7 +163,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 417f84342e98..672eb17ac421 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -306,13 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_fields(struct bpf_array *arr, void *val) -{ - if (map_value_has_timer(&arr->map)) - bpf_timer_cancel_and_free(val + arr->map.timer_off); - bpf_obj_free_fields(arr->map.record, val); -} - /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -334,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -348,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } return 0; } @@ -385,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -409,11 +402,11 @@ static void array_map_free_timers(struct bpf_map *map) int i; /* We don't reset or free fields other than timer on uref dropping to zero. 
*/ - if (!map_value_has_timer(map)) + if (!btf_record_has_field(map->record, BPF_TIMER)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 93d9b1b17bc8..37020078d1c1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -382,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(&smap->map))) + !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8391a77138ee..3dad828db13c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3205,16 +3205,20 @@ enum { struct btf_field_info { enum btf_field_type type; u32 off; - u32 type_id; + struct { + u32 type_id; + } kptr; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, enum btf_field_type field_type, + struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; + info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } @@ -3251,28 +3255,66 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!__btf_type_is_struct(t)) return -EINVAL; - info->type_id = res_id; - info->off = off; info->type = type; + info->off = off; + info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } -static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_info_type field_type, +static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, + int *align, int *sz) +{ + int type = 0; + + if (field_mask & BPF_SPIN_LOCK) { + if (!strcmp(name, "bpf_spin_lock")) { + if (*seen_mask & BPF_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_SPIN_LOCK; + type = BPF_SPIN_LOCK; + goto end; + } + } + if (field_mask & BPF_TIMER) { + if (!strcmp(name, "bpf_timer")) { + if (*seen_mask & BPF_TIMER) + return -E2BIG; + *seen_mask |= BPF_TIMER; + type = BPF_TIMER; + goto end; + } + } + /* Only return BPF_KPTR when all other types with matchable names fail */ + if (field_mask & BPF_KPTR) { + type = BPF_KPTR_REF; + goto end; + } + return 0; +end: + *sz = btf_field_type_size(type); + *align = btf_field_type_align(type); + return type; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_member *member; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return 
field_type; off = __btf_member_bit_offset(t, member); if (off % 8) @@ -3280,17 +3322,18 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t return -EINVAL; off /= 8; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, member_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3310,37 +3353,41 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_info_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_var_secinfo *vsi; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - off = vsi->offset; - - if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; + + off = vsi->offset; if (vsi->size != sz) continue; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, var_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3360,78 +3407,98 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_info_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { - const char *name; - int sz, align; - - switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - name = "bpf_spin_lock"; - sz = sizeof(struct bpf_spin_lock); - align = __alignof__(struct bpf_spin_lock); - break; - case BTF_FIELD_TIMER: - name = "bpf_timer"; - sz = sizeof(struct bpf_timer); - align = __alignof__(struct bpf_timer); - break; - case BTF_FIELD_KPTR: - name = NULL; - sz = sizeof(u64); - align = 8; - break; - default: - return -EFAULT; - } - if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); return -EINVAL; } -/* find 'struct bpf_spin_lock' in map value. 
- * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; + struct module *mod = NULL; + const struct btf_type *t; + struct btf *kernel_btf; int ret; + s32 id; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; -} + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info->kptr.type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) + return id; + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info->type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } -int btf_find_timer(const struct btf *btf, const struct btf_type *t) -{ - struct btf_field_info info; - int ret; + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + field->kptr.dtor = (void *)addr; + } + + field->kptr.btf_id = id; + field->kptr.btf = kernel_btf; + field->kptr.module = mod; + return 0; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); + return ret; } -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; - struct btf *kernel_btf = NULL; - struct module *mod = NULL; struct btf_record *rec; int ret, i, cnt; - ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) @@ -3441,80 +3508,44 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); - rec->cnt = 0; - for (i = 0; i < cnt; i++) { - const struct btf_type *t; - s32 id; - /* Find type in map BTF, and use it to look up the matching type - * in vmlinux or module BTFs, by name and kind. 
- */ - t = btf_type_by_id(btf, info_arr[i].type_id); - id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); - if (id < 0) { - ret = id; + rec->spin_lock_off = -EINVAL; + rec->timer_off = -EINVAL; + for (i = 0; i < cnt; i++) { + if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); + ret = -EFAULT; goto end; } - /* Find and stash the function pointer for the destruction function that - * needs to be eventually invoked from the map free path. - */ - if (info_arr[i].type == BPF_KPTR_REF) { - const struct btf_type *dtor_func; - const char *dtor_func_name; - unsigned long addr; - s32 dtor_btf_id; - - /* This call also serves as a whitelist of allowed objects that - * can be used as a referenced pointer and be stored in a map at - * the same time. - */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); - if (dtor_btf_id < 0) { - ret = dtor_btf_id; - goto end_btf; - } - - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); - if (!dtor_func) { - ret = -ENOENT; - goto end_btf; - } - - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); - if (!mod) { - ret = -ENXIO; - goto end_btf; - } - } - - /* We already verified dtor_func to be btf_type_is_func - * in register_btf_id_dtor_kfuncs. - */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); - addr = kallsyms_lookup_name(dtor_func_name); - if (!addr) { - ret = -EINVAL; - goto end_mod; - } - rec->fields[i].kptr.dtor = (void *)addr; - } - rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; - rec->fields[i].kptr.btf_id = id; - rec->fields[i].kptr.btf = kernel_btf; - rec->fields[i].kptr.module = mod; + + switch (info_arr[i].type) { + case BPF_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->spin_lock_off = rec->fields[i].offset; + break; + case BPF_TIMER: + WARN_ON_ONCE(rec->timer_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->timer_off = rec->fields[i].offset; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + default: + ret = -EFAULT; + goto end; + } rec->cnt++; } return rec; -end_mod: - module_put(mod); -end_btf: - btf_put(kernel_btf); end: btf_record_free(rec); return ERR_PTR(ret); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5ea8f9bb7a9..50d254cd0709 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_timer(&htab->map)) + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -231,9 +231,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -763,8 +761,6 @@ static void check_and_free_fields(struct bpf_htab *htab, { void *map_value = elem->key + round_up(htab->map.key_size, 8); - if (map_value_has_timer(&htab->map)) - bpf_timer_cancel_and_free(map_value + htab->map.timer_off); 
bpf_obj_free_fields(htab->map.record, map_value); } @@ -1089,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) + if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, @@ -1472,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We don't reset or free kptr on uref dropping to zero, - * hence just free timer. - */ - bpf_timer_cancel_and_free(l->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + /* We only free timer on uref dropping to zero */ + bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } @@ -1488,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(&htab->map)) + /* We only free timer on uref dropping to zero */ + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1673,7 +1665,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 124fd199ce5c..283f55bbeb70 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -366,9 +366,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -1169,7 +1169,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 098cf336fae6..e90d9f63edc5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index d6c662183f88..8ca0cca39d49 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -29,7 +29,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-ENOTSUPP); } - if (map_value_has_spin_lock(inner_map)) { + if (btf_record_has_field(inner_map->record, BPF_SPIN_LOCK)) { fdput(f); return ERR_PTR(-ENOTSUPP); } @@ -50,8 +50,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) 
inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; - inner_map_meta->spin_lock_off = inner_map->spin_lock_off; - inner_map_meta->timer_off = inner_map->timer_off; inner_map_meta->record = btf_record_dup(inner_map->record); if (IS_ERR(inner_map_meta->record)) { /* btf_record_dup returns NULL or valid pointer in case of @@ -92,7 +90,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && btf_record_equal(meta0->record, meta1->record); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b80c0e2eb73f..53d6dc5cf0e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -527,6 +527,9 @@ void btf_record_free(struct btf_record *rec) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (rec->fields[i].kptr.module) @@ -564,6 +567,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: btf_get(fields[i].kptr.btf); @@ -600,6 +606,13 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return !memcmp(rec_a, rec_b, size); } +void bpf_obj_free_timer(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) + return; + bpf_timer_cancel_and_free(obj + rec->timer_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -613,6 +626,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) void *field_ptr = obj + field->offset; switch (fields[i].type) { + case BPF_SPIN_LOCK: + break; + case BPF_TIMER: + bpf_timer_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -798,8 +816,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) + if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -954,48 +971,30 @@ static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) static int bpf_map_alloc_off_arr(struct bpf_map *map) { - bool has_spin_lock = map_value_has_spin_lock(map); - bool has_timer = map_value_has_timer(map); bool has_fields = !IS_ERR_OR_NULL(map->record); struct btf_field_offs *fo; - u32 i; + struct btf_record *rec; + u32 i, *off; + u8 *sz; - if (!has_spin_lock && !has_timer && !has_fields) { + if (!has_fields) { map->field_offs = NULL; return 0; } - fo = kmalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); + fo = kzalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); if (!fo) return -ENOMEM; map->field_offs = fo; - fo->cnt = 0; - if (has_spin_lock) { - i = fo->cnt; - - fo->field_off[i] = map->spin_lock_off; - fo->field_sz[i] = sizeof(struct bpf_spin_lock); - fo->cnt++; - } - if (has_timer) { - i = fo->cnt; - - fo->field_off[i] = map->timer_off; - fo->field_sz[i] = sizeof(struct bpf_timer); - fo->cnt++; - } - if 
(has_fields) { - struct btf_record *rec = map->record; - u32 *off = &fo->field_off[fo->cnt]; - u8 *sz = &fo->field_sz[fo->cnt]; - - for (i = 0; i < rec->cnt; i++) { - *off++ = rec->fields[i].offset; - *sz++ = btf_field_type_size(rec->fields[i].type); - } - fo->cnt += rec->cnt; + rec = map->record; + off = fo->field_off; + sz = fo->field_sz; + for (i = 0; i < rec->cnt; i++) { + *off++ = rec->fields[i].offset; + *sz++ = btf_field_type_size(rec->fields[i].type); } + fo->cnt = rec->cnt; if (fo->cnt == 1) return 0; @@ -1026,39 +1025,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE && - map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } - - map->timer_off = btf_find_timer(btf, value_type); - if (map_value_has_timer(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) - return -EOPNOTSUPP; - } - - map->record = btf_parse_fields(btf, value_type); + map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, + map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1074,6 +1042,26 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, switch (map->record->field_mask & (1 << i)) { case 0: continue; + case BPF_SPIN_LOCK: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_TIMER: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (map->map_type != BPF_MAP_TYPE_HASH && @@ -1153,8 +1141,6 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - map->spin_lock_off = -EINVAL; - map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with */ @@ -1368,7 +1354,7 @@ static int map_lookup_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1441,7 +1427,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@
-1604,7 +1590,7 @@ int generic_map_delete_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1661,7 +1647,7 @@ int generic_map_update_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1724,7 +1710,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); @@ -1846,7 +1832,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1917,8 +1903,7 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ce5364ce898..73a3516f1a48 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -454,7 +454,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); + btf_record_has_field(reg->map_ptr->record, BPF_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -1388,7 +1388,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (map_value_has_timer(map->inner_map_meta)) + if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; @@ -3817,29 +3817,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) return err; - if (map_value_has_spin_lock(map)) { - u32 lock = map->spin_lock_off; - - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) - * it is sufficient to check x1 < y2 && y1 < x2. - */ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_timer(map)) { - u32 t = map->timer_off; - - if (reg->smin_value + off < t + sizeof(struct bpf_timer) && - t < reg->umax_value + off + size) { - verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); - return -EACCES; - } - } if (IS_ERR_OR_NULL(map->record)) return 0; rec = map->record; @@ -3847,6 +3824,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct btf_field *field = &rec->fields[i]; u32 p = field->offset; + /* If any part of a field can be touched by load/store, reject + * this program. To check that [x1, x2) overlaps with [y1, y2), + * it is sufficient to check x1 < y2 && y1 < x2. 
+ */ if (reg->smin_value + off < p + btf_field_type_size(field->type) && p < reg->umax_value + off + size) { switch (field->type) { @@ -3871,7 +3852,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, } break; default: - verbose(env, "field cannot be accessed directly by load/store\n"); + verbose(env, "%s cannot be accessed directly by load/store\n", + btf_field_type_name(field->type)); return -EACCES; } } @@ -5440,24 +5422,13 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + verbose(env, "map '%s' has no valid bpf_spin_lock\n", map->name); return -EINVAL; } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); + if (map->record->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", + val + reg->off, map->record->spin_lock_off); return -EINVAL; } if (is_lock) { @@ -5500,24 +5471,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_timer(map)) { - if (map->timer_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_timer'\n", - map->name); - else if (map->timer_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_timer'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_timer is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_TIMER)) { + verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL; } - if (map->timer_off != val + reg->off) { + if (map->record->timer_off != val + reg->off) { verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->timer_off); + val + reg->off, map->record->timer_off); return -EINVAL; } if (meta->map_ptr) { @@ -7470,7 +7430,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_type) && - map_value_has_spin_lock(meta.map_ptr)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -10381,7 +10341,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (map_value_has_spin_lock(map)) + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -12659,7 +12619,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (map_value_has_spin_lock(map)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -12676,7 +12636,7 @@ 
static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (map_value_has_timer(map)) { + if (btf_record_has_field(map->record, BPF_TIMER)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_timer yet\n"); return -EINVAL; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 49884e7de080..9d2288c0736e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -147,7 +147,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, if (!copy_selem) return NULL; - if (map_value_has_spin_lock(&smap->map)) + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data, true); else @@ -566,7 +566,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_value) goto errout; - if (map_value_has_spin_lock(&smap->map)) + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, nla_data(nla_value), sdata->data, true); else -- cgit v1.2.3 From f71b2f64177a199d5b1d2047e155d45fd98f564a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:57 +0530 Subject: bpf: Refactor map->off_arr handling Refactor map->off_arr handling into generic functions that can work on their own without hardcoding map-specific code. The btf_field_offs structure is now returned from btf_parse_field_offs, which can be reused later for types in program BTF. All functions like copy_map_value, zero_map_value call generic underlying functions so that they can also be reused later for copying to values allocated in programs which encode specific fields. Later, some helper functions will also require access to this btf_field_offs structure to be able to skip over special fields at runtime. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 +++++++++++++++++------------ include/linux/btf.h | 1 + kernel/bpf/btf.c | 55 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 73 +++++++--------------------------------------------- 4 files changed, 89 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aae85019abde..798aec816970 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -341,57 +341,64 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each.
*/ -static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) +static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, + void *dst, void *src, u32 size, + bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { + if (likely(!foffs)) { if (long_memcpy) - bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + bpf_long_memcpy(dst, src, round_up(size, 8)); else - memcpy(dst, src, map->value_size); + memcpy(dst, src, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); + memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, false); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, true); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); } -static inline void zero_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { - memset(dst, 0, map->value_size); + if (likely(!foffs)) { + memset(dst, 0, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memset(dst + curr_off, 0, map->value_size - curr_off); + memset(dst + curr_off, 0, size - curr_off); +} + +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_memzero(map->field_offs, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 282006abd062..d80345fa566b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -165,6 +165,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3dad828db13c..5579ff3a5b54 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3551,6 +3551,61 @@ end: return ERR_PTR(ret); } +static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) +{ + struct btf_field_offs *foffs = (void *)priv; + u32 *off_base = foffs->field_off; + u32 
*a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = foffs->field_sz + (a - off_base); + sz_b = foffs->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) +{ + struct btf_field_offs *foffs; + u32 i, *off; + u8 *sz; + + BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); + if (IS_ERR_OR_NULL(rec) || WARN_ON_ONCE(rec->cnt > sizeof(foffs->field_off))) + return NULL; + + foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); + if (!foffs) + return ERR_PTR(-ENOMEM); + + off = foffs->field_off; + sz = foffs->field_sz; + for (i = 0; i < rec->cnt; i++) { + off[i] = rec->fields[i].offset; + sz[i] = btf_field_type_size(rec->fields[i].type); + } + foffs->cnt = rec->cnt; + + if (foffs->cnt == 1) + return foffs; + sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), + btf_field_offs_cmp, btf_field_offs_swap, foffs); + return foffs; +} + static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53d6dc5cf0e2..85532d301124 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -943,66 +943,6 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_field_offs_cmp(const void *_a, const void *_b, const void *priv) -{ - const u32 a = *(const u32 *)_a; - const u32 b = *(const u32 *)_b; - - if (a < b) - return -1; - else if (a > b) - return 1; - return 0; -} - -static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) -{ - struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->field_offs->field_off; - u32 *a = _a, *b = _b; - u8 *sz_a, *sz_b; - - sz_a = map->field_offs->field_sz + (a - off_base); - sz_b = map->field_offs->field_sz + (b - off_base); - - swap(*a, *b); - swap(*sz_a, *sz_b); -} - -static int bpf_map_alloc_off_arr(struct bpf_map *map) -{ - bool has_fields = !IS_ERR_OR_NULL(map->record); - struct btf_field_offs *fo; - struct btf_record *rec; - u32 i, *off; - u8 *sz; - - if (!has_fields) { - map->field_offs = NULL; - return 0; - } - - fo = kzalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); - if (!fo) - return -ENOMEM; - map->field_offs = fo; - - rec = map->record; - off = fo->field_off; - sz = fo->field_sz; - for (i = 0; i < rec->cnt; i++) { - *off++ = rec->fields[i].offset; - *sz++ = btf_field_type_size(rec->fields[i].type); - } - fo->cnt = rec->cnt; - - if (fo->cnt == 1) - return 0; - sort_r(fo->field_off, fo->cnt, sizeof(fo->field_off[0]), - map_field_offs_cmp, map_field_offs_swap, map); - return 0; -} - static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -1097,6 +1037,7 @@ free_map_tab: static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1176,13 +1117,17 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = bpf_map_alloc_off_arr(map); - if (err) + + foffs = btf_parse_field_offs(map->record); + if (IS_ERR(foffs)) { + err = PTR_ERR(foffs); goto free_map; + } + map->field_offs = foffs; err = security_bpf_map_alloc(map); if (err) - goto free_map_off_arr; + goto free_map_field_offs; err = bpf_map_alloc_id(map); if (err) @@ -1206,7 +1151,7 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_off_arr: 
+free_map_field_offs: kfree(map->field_offs); free_map: btf_put(map->btf); -- cgit v1.2.3 From 2b6c0e152868c9c5939a5c5094d5b2be61cf48e6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 28 Oct 2022 11:23:32 +0200 Subject: bcma: Use the proper gpio include The <linux/bcma/bcma_driver_chipcommon.h> header includes the legacy <linux/gpio.h> header to obtain struct gpio_chip. Instead, include <linux/gpio/driver.h>, where this struct is defined. It turns out that the brcm80211 brcmsmac driver depends on this to bring in the symbol gpio_is_valid(). The driver looks up the BCMA parent GPIO driver and checks that this succeeds, but then it goes on to use the deprecated GPIO call gpio_is_valid() to check the consistency of the .base member of the BCMA GPIO struct. The whole check can be dropped because bcma_gpio is initialized in the declarations: struct gpio_chip *bcma_gpio = &cc_drv->gpio; and this can never be NULL. Cc: Jonas Gorski Acked-by: Arend van Spriel Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221028092332.238728-1-linus.walleij@linaro.org --- drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c | 3 --- include/linux/bcma/bcma_driver_chipcommon.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c index c1b9ac692d26..9540a05247c2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/led.c @@ -63,9 +63,6 @@ int brcms_led_register(struct brcms_info *wl) int hwnum = -1; enum gpio_lookup_flags lflags = GPIO_ACTIVE_HIGH; - if (!bcma_gpio || !gpio_is_valid(bcma_gpio->base)) - return -ENODEV; - /* find radio enabled LED */ for (i = 0; i < BRCMS_LED_NO; i++) { u8 led = *leds[i]; diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 2d94c30ed439..0cb6638b55e5 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -4,7 +4,7 @@ #include #include -#include <linux/gpio.h> +#include <linux/gpio/driver.h> /** ChipCommon core registers. **/ #define BCMA_CC_ID 0x0000 -- cgit v1.2.3 From d08cb25556773c65efb21761b75f92c804ad0975 Mon Sep 17 00:00:00 2001 From: David Yang Date: Fri, 28 Oct 2022 10:01:01 +0800 Subject: net: mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood Support mode switching properly, which was not available before. If the SoC has two Ethernet controllers, setting both of them into MII mode makes the first controller enter GMII mode, while the second controller is effectively disabled. This requires configuring (and maybe enabling) the second controller in the device tree, even though it cannot be used. Signed-off-by: David Yang Signed-off-by: David S.
Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 49 +++++++++++++++++++++++++----- include/linux/mv643xx_eth.h | 2 ++ 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 707993b445d1..4389a33a2ea6 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -108,6 +108,7 @@ static char mv643xx_eth_driver_version[] = "1.4"; #define TXQ_COMMAND 0x0048 #define TXQ_FIX_PRIO_CONF 0x004c #define PORT_SERIAL_CONTROL1 0x004c +#define RGMII_EN 0x00000008 #define CLK125_BYPASS_EN 0x00000010 #define TX_BW_RATE 0x0050 #define TX_BW_MTU 0x0058 @@ -2761,6 +2762,8 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, mv643xx_eth_property(pnp, "rx-sram-addr", ppd.rx_sram_addr); mv643xx_eth_property(pnp, "rx-sram-size", ppd.rx_sram_size); + of_get_phy_mode(pnp, &ppd.interface); + ppd.phy_node = of_parse_phandle(pnp, "phy-handle", 0); if (!ppd.phy_node) { ppd.phy_addr = MV643XX_ETH_PHY_NONE; @@ -3092,6 +3095,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev) struct mv643xx_eth_private *mp; struct net_device *dev; struct phy_device *phydev = NULL; + u32 psc1r; int err, irq; pd = dev_get_platdata(&pdev->dev); @@ -3119,14 +3123,45 @@ static int mv643xx_eth_probe(struct platform_device *pdev) mp->dev = dev; - /* Kirkwood resets some registers on gated clocks. Especially - * CLK125_BYPASS_EN must be cleared but is not available on - * all other SoCs/System Controllers using this driver. - */ if (of_device_is_compatible(pdev->dev.of_node, - "marvell,kirkwood-eth-port")) - wrlp(mp, PORT_SERIAL_CONTROL1, - rdlp(mp, PORT_SERIAL_CONTROL1) & ~CLK125_BYPASS_EN); + "marvell,kirkwood-eth-port")) { + psc1r = rdlp(mp, PORT_SERIAL_CONTROL1); + + /* Kirkwood resets some registers on gated clocks. Especially + * CLK125_BYPASS_EN must be cleared but is not available on + * all other SoCs/System Controllers using this driver. + */ + psc1r &= ~CLK125_BYPASS_EN; + + /* On Kirkwood with two Ethernet controllers, if both of them + * have RGMII_EN disabled, the first controller will be in GMII + * mode and the second one is effectively disabled, instead of + * two MII interfaces. + * + * To enable GMII in the first controller, the second one must + * also be configured (and may be enabled) with RGMII_EN + * disabled too, even though it cannot be used at all. + */ + switch (pd->interface) { + /* Use internal to denote second controller being disabled */ + case PHY_INTERFACE_MODE_INTERNAL: + case PHY_INTERFACE_MODE_MII: + case PHY_INTERFACE_MODE_GMII: + psc1r &= ~RGMII_EN; + break; + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + psc1r |= RGMII_EN; + break; + default: + /* Unknown; don't touch */ + break; + } + + wrlp(mp, PORT_SERIAL_CONTROL1, psc1r); + } /* * Start with a default rate, and if there is a clock, allow diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 3682ae75c7aa..145169be2ed8 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -8,6 +8,7 @@ #include #include +#include <linux/phy.h> #define MV643XX_ETH_SHARED_NAME "mv643xx_eth" #define MV643XX_ETH_NAME "mv643xx_eth_port" @@ -59,6 +60,7 @@ struct mv643xx_eth_platform_data { */ int speed; int duplex; + phy_interface_t interface; /* * How many RX/TX queues to use.
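As a usage sketch for the above (not part of the patch; the eth0_pd/eth1_pd names and everything except the .interface field are illustrative assumptions), a Kirkwood board wanting GMII on the first controller would configure both ports:

	static struct mv643xx_eth_platform_data eth0_pd = {
		/* first controller: RGMII_EN cleared -> GMII */
		.interface = PHY_INTERFACE_MODE_GMII,
	};

	static struct mv643xx_eth_platform_data eth1_pd = {
		/* second controller: must also have RGMII_EN cleared;
		 * PHY_INTERFACE_MODE_INTERNAL marks it as disabled
		 */
		.interface = PHY_INTERFACE_MODE_INTERNAL,
	};

On device-tree systems the equivalent is a phy-mode property ("mii", "gmii", "rgmii", ...) in each port node, which mv643xx_eth_shared_of_add_port() now picks up via of_get_phy_mode().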
-- cgit v1.2.3 From e1f4ecab19338ec7079830e8700e4869f991fd45 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Nov 2022 17:13:01 +0000 Subject: net: remove explicit phylink_generic_validate() references Virtually all conventional network drivers are now converted to use phylink_generic_validate() - only DSA drivers and fman_memac remain, so let's remove the necessity for network drivers to explicitly set this member, and default to phylink_generic_validate() when unset. This is possible as .validate must currently be set. Any remaining instances that have not been addressed by this patch can be fixed up later. Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/E1or0FZ-001tRa-DI@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/altera/altera_tse_main.c | 1 - drivers/net/ethernet/atheros/ag71xx.c | 1 - drivers/net/ethernet/cadence/macb_main.c | 1 - drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c | 1 - drivers/net/ethernet/freescale/enetc/enetc_pf.c | 1 - drivers/net/ethernet/freescale/fman/fman_dtsec.c | 1 - drivers/net/ethernet/freescale/fman/fman_tgec.c | 1 - drivers/net/ethernet/marvell/mvneta.c | 1 - drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 - drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 - drivers/net/ethernet/mediatek/mtk_eth_soc.c | 1 - drivers/net/ethernet/microchip/lan966x/lan966x_phylink.c | 1 - drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c | 1 - drivers/net/ethernet/mscc/ocelot_net.c | 1 - drivers/net/ethernet/renesas/rswitch.c | 1 - drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 - drivers/net/ethernet/ti/am65-cpsw-nuss.c | 1 - drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 1 - drivers/net/phy/phylink.c | 5 ++++- drivers/net/usb/asix_devices.c | 1 - include/linux/phylink.h | 5 +++++ 21 files changed, 9 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 7633b227b2ca..28b5cae60eb5 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -1095,7 +1095,6 @@ static struct phylink_pcs *alt_tse_select_pcs(struct phylink_config *config, } static const struct phylink_mac_ops alt_tse_phylink_ops = { - .validate = phylink_generic_validate, .mac_an_restart = alt_tse_mac_an_restart, .mac_config = alt_tse_mac_config, .mac_link_down = alt_tse_mac_link_down, diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index cc932b3cf873..a5de1bd8538c 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -1086,7 +1086,6 @@ static void ag71xx_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops ag71xx_phylink_mac_ops = { - .validate = phylink_generic_validate, .mac_config = ag71xx_mac_config, .mac_link_down = ag71xx_mac_link_down, .mac_link_up = ag71xx_mac_link_up, diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 4f63f1ba3161..5d618814bb12 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -742,7 +742,6 @@ static struct phylink_pcs *macb_mac_select_pcs(struct phylink_config *config, } static const struct phylink_mac_ops macb_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = macb_mac_select_pcs, .mac_config = macb_mac_config, .mac_link_down = macb_mac_link_down,
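For a converted driver, the net effect is that the ops table simply omits the member; a minimal sketch with placeholder foo_* names (not taken from the patch):

	static const struct phylink_mac_ops foo_phylink_ops = {
		/* no .validate member: phylink falls back to
		 * phylink_generic_validate(), driven by
		 * config->mac_capabilities and
		 * config->supported_interfaces
		 */
		.mac_config = foo_mac_config,
		.mac_link_down = foo_mac_link_down,
		.mac_link_up = foo_mac_link_up,
	};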
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index 2bbab28f763d..51c9da8e1be2 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -236,7 +236,6 @@ static void dpaa2_mac_link_down(struct phylink_config *config, } static const struct phylink_mac_ops dpaa2_mac_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = dpaa2_mac_select_pcs, .mac_config = dpaa2_mac_config, .mac_link_up = dpaa2_mac_link_up, diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index bdf94335ee99..9f6c4f5c0a6c 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -1111,7 +1111,6 @@ static void enetc_pl_mac_link_down(struct phylink_config *config, } static const struct phylink_mac_ops enetc_mac_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = enetc_pl_mac_select_pcs, .mac_config = enetc_pl_mac_config, .mac_link_up = enetc_pl_mac_link_up, diff --git a/drivers/net/ethernet/freescale/fman/fman_dtsec.c b/drivers/net/ethernet/freescale/fman/fman_dtsec.c index 3c87820ca202..d00bae15a901 100644 --- a/drivers/net/ethernet/freescale/fman/fman_dtsec.c +++ b/drivers/net/ethernet/freescale/fman/fman_dtsec.c @@ -986,7 +986,6 @@ static void dtsec_link_down(struct phylink_config *config, unsigned int mode, } static const struct phylink_mac_ops dtsec_mac_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = dtsec_select_pcs, .mac_config = dtsec_mac_config, .mac_link_up = dtsec_link_up, diff --git a/drivers/net/ethernet/freescale/fman/fman_tgec.c b/drivers/net/ethernet/freescale/fman/fman_tgec.c index c265b7f19a4d..c2261d26db5b 100644 --- a/drivers/net/ethernet/freescale/fman/fman_tgec.c +++ b/drivers/net/ethernet/freescale/fman/fman_tgec.c @@ -469,7 +469,6 @@ static void tgec_link_down(struct phylink_config *config, unsigned int mode, } static const struct phylink_mac_ops tgec_mac_ops = { - .validate = phylink_generic_validate, .mac_config = tgec_mac_config, .mac_link_up = tgec_link_up, .mac_link_down = tgec_link_down, diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 05ff3c58087e..c2cb98d24f5c 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4228,7 +4228,6 @@ static void mvneta_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops mvneta_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = mvneta_mac_select_pcs, .mac_prepare = mvneta_mac_prepare, .mac_config = mvneta_mac_config, diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 116e53172072..d98f7e9a480e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6603,7 +6603,6 @@ static void mvpp2_mac_link_down(struct phylink_config *config, } static const struct phylink_mac_ops mvpp2_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = mvpp2_select_pcs, .mac_prepare = mvpp2_mac_prepare, .mac_config = mvpp2_mac_config, diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index edbdda8f958d..f8deaee84398 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ 
b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -360,7 +360,6 @@ static void prestera_pcs_an_restart(struct phylink_pcs *pcs) } static const struct phylink_mac_ops prestera_mac_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = prestera_mac_select_pcs, .mac_config = prestera_mac_config, .mac_link_down = prestera_mac_link_down, diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 789268b15106..1cf76fd7afbc 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -653,7 +653,6 @@ static void mtk_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops mtk_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = mtk_mac_select_pcs, .mac_pcs_get_state = mtk_mac_pcs_get_state, .mac_config = mtk_mac_config, diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_phylink.c b/drivers/net/ethernet/microchip/lan966x/lan966x_phylink.c index f9579a2ae676..c5f9803e6e63 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_phylink.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_phylink.c @@ -124,7 +124,6 @@ static void lan966x_pcs_aneg_restart(struct phylink_pcs *pcs) } const struct phylink_mac_ops lan966x_phylink_mac_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = lan966x_phylink_mac_select, .mac_config = lan966x_phylink_mac_config, .mac_prepare = lan966x_phylink_mac_prepare, diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c b/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c index 830da0e5ff27..bb97d27a1da4 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c @@ -138,7 +138,6 @@ const struct phylink_pcs_ops sparx5_phylink_pcs_ops = { }; const struct phylink_mac_ops sparx5_phylink_mac_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = sparx5_phylink_mac_select_pcs, .mac_config = sparx5_phylink_mac_config, .mac_link_down = sparx5_phylink_mac_link_down, diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index f50dada2bb8e..ca4bde861397 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1727,7 +1727,6 @@ static void vsc7514_phylink_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops ocelot_phylink_ops = { - .validate = phylink_generic_validate, .mac_config = vsc7514_phylink_mac_config, .mac_link_down = vsc7514_phylink_mac_link_down, .mac_link_up = vsc7514_phylink_mac_link_up, diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index f0168fedfef9..4f7af6393c9e 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1190,7 +1190,6 @@ static void rswitch_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops rswitch_phylink_ops = { - .validate = phylink_generic_validate, .mac_config = rswitch_mac_config, .mac_link_down = rswitch_mac_link_down, .mac_link_up = rswitch_mac_link_up, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 8273e6a175c8..2fea8785aaf4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1080,7 +1080,6 @@ static void stmmac_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops 
stmmac_phylink_mac_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = stmmac_mac_select_pcs, .mac_config = stmmac_mac_config, .mac_link_down = stmmac_mac_link_down, diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index c785e01d025f..75429ff0c200 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1520,7 +1520,6 @@ static void am65_cpsw_nuss_mac_link_up(struct phylink_config *config, struct phy } static const struct phylink_mac_ops am65_cpsw_phylink_mac_ops = { - .validate = phylink_generic_validate, .mac_config = am65_cpsw_nuss_mac_config, .mac_link_down = am65_cpsw_nuss_mac_link_down, .mac_link_up = am65_cpsw_nuss_mac_link_up, diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index d082f2b10f4d..3e310b55bce2 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1736,7 +1736,6 @@ static void axienet_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops axienet_phylink_ops = { - .validate = phylink_generic_validate, .mac_select_pcs = axienet_mac_select_pcs, .mac_config = axienet_mac_config, .mac_link_down = axienet_mac_link_down, diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 88f60e98b760..25feab1802ee 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -649,7 +649,10 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl, } /* Then validate the link parameters with the MAC */ - pl->mac_ops->validate(pl->config, supported, state); + if (pl->mac_ops->validate) + pl->mac_ops->validate(pl->config, supported, state); + else + phylink_generic_validate(pl->config, supported, state); return phylink_is_empty_linkmode(supported) ? -EINVAL : 0; } diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 02941d97d034..0fe3773c5bca 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -773,7 +773,6 @@ static void ax88772_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { - .validate = phylink_generic_validate, .mac_config = ax88772_mac_config, .mac_link_down = ax88772_mac_link_down, .mac_link_up = ax88772_mac_link_up, diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1df3e5e316e8..c492c26202b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -207,6 +207,11 @@ struct phylink_mac_ops { * * If the @state->interface mode is not supported, then the @supported * mask must be cleared. + * + * This member is optional; if not set, the generic validator will be + * used making use of @config->mac_capabilities and + * @config->supported_interfaces to determine which link modes are + * supported. */ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 9a0f830f80265bd1ef816e1541ac24bee80e9a3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:01:25 -0700 Subject: ethtool: linkstate: add a statistic for PHY down events The previous attempt to augment carrier_down (see Link) was not met with much enthusiasm so let's do the simple thing of exposing what some devices already maintain. Add a common ethtool statistic for link going down. 
Currently users have to maintain per-driver mapping to extract the right stat from the vendor-specific ethtool -S stats. carrier_down does not fit the bill because it counts a lot of software related false positives. Add the statistic to the extended link state API to steer vendors towards implementing all of it. Implement for bnxt and all Linux-controlled PHYs. mlx5 and (possibly) enic also have a counter for this but I leave the implementation to their maintainers. Link: https://lore.kernel.org/r/20220520004500.2250674-1-kuba@kernel.org Reviewed-by: Florian Fainelli Reviewed-by: Michael Chan Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221104190125.684910-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/networking/ethtool-netlink.rst | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++++++++++++++ drivers/net/phy/phy.c | 1 + include/linux/ethtool.h | 17 ++++++++++++++++ include/linux/phy.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/linkstate.c | 24 ++++++++++++++++++++++- 7 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d578b8bcd8a4..bede24ef44fd 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -491,6 +491,7 @@ Kernel response contents: ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value ``ETHTOOL_A_LINKSTATE_EXT_STATE`` u8 link extended state ``ETHTOOL_A_LINKSTATE_EXT_SUBSTATE`` u8 link extended substate + ``ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT`` u32 count of link down events ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index cc89e5eabcb9..d8f0351df954 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4112,6 +4112,20 @@ static void bnxt_get_rmon_stats(struct net_device *dev, *ranges = bnxt_rmon_ranges; } +static void bnxt_get_link_ext_stats(struct net_device *dev, + struct ethtool_link_ext_stats *stats) +{ + struct bnxt *bp = netdev_priv(dev); + u64 *rx; + + if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS_EXT)) + return; + + rx = bp->rx_port_stats_ext.sw_stats; + stats->link_down_events = + *(rx + BNXT_RX_STATS_EXT_OFFSET(link_down_events)); +} + void bnxt_ethtool_free(struct bnxt *bp) { kfree(bp->test_info); @@ -4161,6 +4175,7 @@ const struct ethtool_ops bnxt_ethtool_ops = { .get_eeprom = bnxt_get_eeprom, .set_eeprom = bnxt_set_eeprom, .get_link = bnxt_get_link, + .get_link_ext_stats = bnxt_get_link_ext_stats, .get_eee = bnxt_get_eee, .set_eee = bnxt_set_eee, .get_module_info = bnxt_get_module_info, diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e741d8aebffe..e5b6cb1a77f9 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -67,6 +67,7 @@ static void phy_link_down(struct phy_device *phydev) { phydev->phy_link_change(phydev, false); phy_led_trigger_change_speed(phydev); + WRITE_ONCE(phydev->link_down_events, phydev->link_down_events + 1); } static const char *phy_pause_str(struct phy_device *phydev) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 99dc7bfbcd3c..5c51c7fda32a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -125,6 +125,20 @@ 
struct ethtool_link_ext_state_info { }; }; +struct ethtool_link_ext_stats { + /* Custom Linux statistic for PHY level link down events. + * In a simpler world it should be equal to netdev->carrier_down_count + * unfortunately netdev also counts local reconfigurations which don't + * actually take the physical link down, not to mention NC-SI which, + * if present, keeps the link up regardless of host state. + * This statistic counts when PHY _actually_ went down, or lost link. + * + * Note that we need u64 for ethtool_stats_init() and comparisons + * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. + */ + u64 link_down_events; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -481,6 +495,7 @@ struct ethtool_module_power_mode_params { * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. + * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. @@ -652,6 +667,8 @@ struct ethtool_ops { u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); + void (*get_link_ext_stats)(struct net_device *dev, + struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/linux/phy.h b/include/linux/phy.h index ddf66198f751..9a3752c0c444 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -600,6 +600,7 @@ struct macsec_ops; * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine + * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * @@ -723,6 +724,8 @@ struct phy_device { int pma_extable; + unsigned int link_down_events; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bb57084ac524..aaf7c6963d61 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -262,6 +262,7 @@ enum { ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index fb676f349455..2158c17a0b32 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -13,6 +13,7 @@ struct linkstate_reply_data { int link; int sqi; int sqi_max; + struct ethtool_link_ext_stats link_stats; bool link_ext_state_provided; struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; @@ -22,7 +23,7 @@ struct linkstate_reply_data { const struct nla_policy ethnl_linkstate_get_policy[] = { [ETHTOOL_A_LINKSTATE_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int linkstate_get_sqi(struct net_device *dev) @@ -107,6 +108,19 @@ static int 
linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; } + ethtool_stats_init((u64 *)&data->link_stats, + sizeof(data->link_stats) / 8); + + if (req_base->flags & ETHTOOL_FLAG_STATS) { + if (dev->phydev) + data->link_stats.link_down_events = + READ_ONCE(dev->phydev->link_down_events); + + if (dev->ethtool_ops->get_link_ext_stats) + dev->ethtool_ops->get_link_ext_stats(dev, + &data->link_stats); + } + ret = 0; out: ethnl_ops_complete(dev); @@ -134,6 +148,9 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->ethtool_link_ext_state_info.__link_ext_substate) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) + len += nla_total_size(sizeof(u32)); + return len; } @@ -166,6 +183,11 @@ static int linkstate_fill_reply(struct sk_buff *skb, return -EMSGSIZE; } + if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, + data->link_stats.link_down_events)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From c3d96f690a790074b508fe183a41e36a00cd7ddd Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Oct 2022 07:34:21 +0100 Subject: net, proc: Provide PROC_FS=n fallback for proc_create_net_single_write() Provide a CONFIG_PROC_FS=n fallback for proc_create_net_single_write(). Also provide a fallback for proc_create_net_data_write(). Fixes: 564def71765c ("proc: Add a way to make network proc files writable") Reported-by: kernel test robot Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/proc_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 81d6e4ec2294..0260f5ea98fe 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -208,8 +208,10 @@ static inline void proc_remove(struct proc_dir_entry *de) {} static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; } #define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;}) +#define proc_create_net_data_write(name, mode, parent, ops, write, state_size, data) ({NULL;}) #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +#define proc_create_net_single_write(name, mode, parent, show, write, data) ({NULL;}) static inline struct pid *tgid_pidfd_to_pid(const struct file *file) { -- cgit v1.2.3 From 42fb06b391ace2aec5cdb1ebb8ff668f0a34332f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 08:49:29 +0100 Subject: net: Change the udp encap_err_rcv to allow use of {ip,ipv6}_icmp_error() Change the udp encap_err_rcv signature to match ip_icmp_error() and ipv6_icmp_error() so that those can be used from the called function and export them. 
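For illustration, a tunnel driver's handler under the new signature could look roughly like this (a minimal sketch: the my_tunnel_* name is hypothetical, while the callback signature and the two exported helpers come from this patch):

/* Hypothetical handler under the new encap_err_rcv signature: the ICMP
 * error details now arrive pre-parsed, so they can be handed straight to
 * ip_icmp_error()/ipv6_icmp_error() instead of being re-derived from a
 * UDP offset into the skb.
 */
static void my_tunnel_encap_err_rcv(struct sock *sk, struct sk_buff *skb,
				    int err, __be16 port, u32 info,
				    u8 *payload)
{
	if (ip_hdr(skb)->version == IPVERSION)
		ip_icmp_error(sk, skb, err, port, info, payload);
	else
		ipv6_icmp_error(sk, skb, err, port, info, payload);
}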
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/udp.h | 3 +- include/net/udp_tunnel.h | 4 +-- net/ipv4/ip_sockglue.c | 1 + net/ipv4/udp.c | 3 +- net/ipv6/datagram.c | 1 + net/ipv6/udp.c | 3 +- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/peer_event.c | 71 ++++++++++++------------------------------------ 8 files changed, 28 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 5cdba00a904a..dea57aa37df6 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,7 +70,8 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 72394f441dad..0ca9b7a11baf 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -68,8 +68,8 @@ typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, - struct sk_buff *skb, - unsigned int udp_offset); + struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5f16807d3235..9f92ae35bb01 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -433,6 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, } kfree_skb(skb); } +EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 89accc3c8bb3..b859d6c8298e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -784,7 +784,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) - udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); + udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, + (u8 *)(uh+1)); goto out; } if (!inet->recverr) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index df7e032ce87d..7c7155b48f17 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -334,6 +334,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 297f7cc06044..4bc3fc27ec78 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -631,7 +631,8 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Tunnels don't have an application socket: don't pass errors back */ if (tunnel) { if (udp_sk(sk)->encap_err_rcv) - udp_sk(sk)->encap_err_rcv(sk, skb, offset); + udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, + ntohl(info), (u8 *)(uh+1)); goto out; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 
436a1e8d0abd..51270b2e49c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -998,7 +998,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); +void rxrpc_encap_err_rcv(struct sock *, struct sk_buff *, int, __be16, u32, u8 *); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 32561e9567fe..d7d6d7aff985 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -29,20 +29,16 @@ static void rxrpc_distribute_error(struct rxrpc_peer *, int, */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, struct sk_buff *skb, - unsigned int udp_offset, - unsigned int *info, + __be16 udp_port, struct sockaddr_rxrpc *srx) { struct iphdr *ip, *ip0 = ip_hdr(skb); struct icmphdr *icmp = icmp_hdr(skb); - struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); switch (icmp->type) { case ICMP_DEST_UNREACH: - *info = ntohs(icmp->un.frag.mtu); - fallthrough; case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip = (struct iphdr *)((void *)icmp + 8); @@ -63,7 +59,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, case AF_INET: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; @@ -72,7 +68,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, case AF_INET6: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; @@ -93,20 +89,16 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, struct sk_buff *skb, - unsigned int udp_offset, - unsigned int *info, + __be16 udp_port, struct sockaddr_rxrpc *srx) { struct icmp6hdr *icmp = icmp6_hdr(skb); struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); - struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); switch (icmp->icmp6_type) { case ICMPV6_DEST_UNREACH: - *info = ntohl(icmp->icmp6_mtu); - fallthrough; case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: @@ -129,13 +121,13 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, _net("Rx ICMP6 on v4 sock"); srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); break; case AF_INET6: _net("Rx ICMP6"); - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, sizeof(struct in6_addr)); break; @@ -152,15 +144,13 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, /* * Handle an error received on the local endpoint as a tunnel. 
*/ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, - unsigned int udp_offset) +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) { struct sock_extended_err ee; struct sockaddr_rxrpc srx; struct rxrpc_local *local; struct rxrpc_peer *peer; - unsigned int info = 0; - int err; u8 version = ip_hdr(skb)->version; u8 type = icmp_hdr(skb)->type; u8 code = icmp_hdr(skb)->code; @@ -176,13 +166,11 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, switch (ip_hdr(skb)->version) { case IPVERSION: - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, - &info, &srx); + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, port, &srx); break; #ifdef CONFIG_AF_RXRPC_IPV6 case 6: - peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, - &info, &srx); + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, port, &srx); break; #endif default: @@ -201,34 +189,12 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, switch (version) { case IPVERSION: - switch (type) { - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_FRAG_NEEDED: - rxrpc_adjust_mtu(peer, info); - rcu_read_unlock(); - rxrpc_put_peer(peer); - return; - default: - break; - } - - err = EHOSTUNREACH; - if (code <= NR_ICMP_UNREACH) { - /* Might want to do something different with - * non-fatal errors - */ - //harderr = icmp_err_convert[code].fatal; - err = icmp_err_convert[code].errno; - } - break; - - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - err = EPROTO; - break; + if (type == ICMP_DEST_UNREACH && + code == ICMP_FRAG_NEEDED) { + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; } ee.ee_origin = SO_EE_ORIGIN_ICMP; @@ -239,16 +205,13 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_AF_RXRPC_IPV6 case 6: - switch (type) { - case ICMPV6_PKT_TOOBIG: + if (type == ICMPV6_PKT_TOOBIG) { rxrpc_adjust_mtu(peer, info); rcu_read_unlock(); rxrpc_put_peer(peer); return; } - icmpv6_err_convert(type, code, &err); - if (err == EACCES) err = EHOSTUNREACH; -- cgit v1.2.3 From bd039b5ea2a91ea707ee8539df26456bd5be80af Mon Sep 17 00:00:00 2001 From: Andy Ren Date: Mon, 7 Nov 2022 09:42:42 -0800 Subject: net/core: Allow live renaming when an interface is up Allow a network interface to be renamed when the interface is up. As described in the netconsole documentation [1], when netconsole is used as a built-in, it will bring up the specified interface as soon as possible. As a result, user space will not be able to rename the interface since the kernel disallows renaming of interfaces that are administratively up unless the 'IFF_LIVE_RENAME_OK' private flag was set by the kernel. The original solution [2] to this problem was to add a new parameter to the netconsole configuration parameters that allows renaming of the interface used by netconsole while it is administratively up. However, during the discussion that followed, it became apparent that we have no reason to keep the current restriction and instead we should allow user space to rename interfaces regardless of their administrative state: 1. The restriction was put in place over 20 years ago when renaming was only possible via IOCTL and before rtnetlink started notifying user space about such changes like it does today. 2. The 'IFF_LIVE_RENAME_OK' flag was added over 3 years ago in version 5.2 and no regressions were reported. 3. 
In-kernel listeners to 'NETDEV_CHANGENAME' do not seem to care about the administrative state of the interface. Therefore, allow user space to rename running interfaces by removing the restriction and the associated 'IFF_LIVE_RENAME_OK' flag. To help with possible triage, emit a message to the kernel log when an interface is renamed while UP. [1] https://www.kernel.org/doc/Documentation/networking/netconsole.rst [2] https://lore.kernel.org/netdev/20221102002420.2613004-1-andy.ren@getcruise.com/ Signed-off-by: Andy Ren Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +--- net/core/dev.c | 19 ++----------------- net/core/failover.c | 6 +++--- 3 files changed, 6 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45713a06568..4be87b89e481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1650,7 +1650,6 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device - * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1686,7 +1685,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - IFF_LIVE_RENAME_OK = 1<<30, + /* was IFF_LIVE_RENAME_OK */ IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; @@ -1721,7 +1720,6 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER -#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK #define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ diff --git a/net/core/dev.c b/net/core/dev.c index 3bacee3bee78..707de6b841d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1163,22 +1163,6 @@ int dev_change_name(struct net_device *dev, const char *newname) net = dev_net(dev); - /* Some auto-enslaved devices e.g. failover slaves are - * special, as userspace might rename the device after - * the interface had been brought up and running since - * the point kernel initiated auto-enslavement. Allow - * live name change even when these slave devices are - * up and running. - * - * Typically, users of these auto-enslaving devices - * don't actually care about slave name change, as - * they are supposed to operate on master interface - * directly. - */ - if (dev->flags & IFF_UP && - likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) - return -EBUSY; - down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { @@ -1195,7 +1179,8 @@ int dev_change_name(struct net_device *dev, const char *newname) } if (oldname[0] && !strchr(oldname, '%')) - netdev_info(dev, "renamed from %s\n", oldname); + netdev_info(dev, "renamed from %s%s\n", oldname, + dev->flags & IFF_UP ?
" (while UP)" : ""); old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; diff --git a/net/core/failover.c b/net/core/failover.c index 864d2d83eff4..655411c4ca51 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev) goto err_upper_link; } - slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; err_upper_link: netdev_rx_handler_unregister(slave_dev); done: @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev) netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) -- cgit v1.2.3 From 3e52fba03a20234abc65a656cef063a1045d9723 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Nov 2022 14:22:06 +0100 Subject: net: introduce a helper to move notifier block to different namespace Currently, net_dev() netdev notifier variant follows the netdev with per-net notifier from namespace to namespace. This is implemented by move_netdevice_notifiers_dev_net() helper. For devlink it is needed to re-register per-net notifier during devlink reload. Introduce a new helper called move_netdevice_notifier_net() and share the unregister/register code with existing move_netdevice_notifiers_dev_net() helper. 
Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4be87b89e481..02a2318da7c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2826,6 +2826,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); diff --git a/net/core/dev.c b/net/core/dev.c index 707de6b841d0..117e830cabb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1861,6 +1861,22 @@ int unregister_netdevice_notifier_net(struct net *net, } EXPORT_SYMBOL(unregister_netdevice_notifier_net); +static void __move_netdevice_notifier_net(struct net *src_net, + struct net *dst_net, + struct notifier_block *nb) +{ + __unregister_netdevice_notifier_net(src_net, nb); + __register_netdevice_notifier_net(dst_net, nb, true); +} + +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb) +{ + rtnl_lock(); + __move_netdevice_notifier_net(src_net, dst_net, nb); + rtnl_unlock(); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -1897,10 +1913,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev, { struct netdev_net_notifier *nn; - list_for_each_entry(nn, &dev->net_notifier_list, list) { - __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); - __register_netdevice_notifier_net(net, nn->nb, true); - } + list_for_each_entry(nn, &dev->net_notifier_list, list) + __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** -- cgit v1.2.3 From f6479ea4e5990e17b9690d66474198dcdba1b2c1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 8 Nov 2022 14:25:56 +0000 Subject: net: mdio: add mdiodev_c45_(read|write) Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 00177567cfef..f7fbbf3069e7 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -488,6 +488,19 @@ static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); } +static inline int mdiodev_c45_read(struct mdio_device *mdiodev, int devad, + u16 regnum) +{ + return mdiobus_c45_read(mdiodev->bus, mdiodev->addr, devad, regnum); +} + +static inline int mdiodev_c45_write(struct mdio_device *mdiodev, u32 devad, + u16 regnum, u16 val) +{ + return mdiobus_c45_write(mdiodev->bus, mdiodev->addr, devad, regnum, + val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); -- cgit v1.2.3 From cc514101a97e3fb48f8617c8f291db798d10d831 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Sat, 5 Nov 2022 23:36:18 +0100 Subject: net: 
ethernet: mtk_wed: introduce wed mcu support Introduce WED mcu support used to configure WED WO chip. This is a preliminary patch in order to add RX Wireless Ethernet Dispatch available on MT7986 SoC. Tested-by: Daniel Golle Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Sujuan Chen Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/Makefile | 2 +- drivers/net/ethernet/mediatek/mtk_wed_mcu.c | 359 +++++++++++++++++++++++++++ drivers/net/ethernet/mediatek/mtk_wed_regs.h | 1 + drivers/net/ethernet/mediatek/mtk_wed_wo.h | 150 +++++++++++ include/linux/soc/mediatek/mtk_wed.h | 29 +++ 5 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mediatek/mtk_wed_mcu.c create mode 100644 drivers/net/ethernet/mediatek/mtk_wed_wo.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/Makefile b/drivers/net/ethernet/mediatek/Makefile index 45ba0970504a..d4bdefa77159 100644 --- a/drivers/net/ethernet/mediatek/Makefile +++ b/drivers/net/ethernet/mediatek/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o mtk_ppe.o mtk_ppe_debugfs.o mtk_ppe_offload.o -mtk_eth-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed.o +mtk_eth-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed.o mtk_wed_mcu.o ifdef CONFIG_DEBUG_FS mtk_eth-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed_debugfs.o endif diff --git a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c new file mode 100644 index 000000000000..ce40d58000a0 --- /dev/null +++ b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2022 MediaTek Inc. + * + * Author: Lorenzo Bianconi + * Sujuan Chen + */ + +#include +#include +#include +#include +#include + +#include "mtk_wed_regs.h" +#include "mtk_wed_wo.h" +#include "mtk_wed.h" + +static u32 wo_r32(struct mtk_wed_wo *wo, u32 reg) +{ + return readl(wo->boot.addr + reg); +} + +static void wo_w32(struct mtk_wed_wo *wo, u32 reg, u32 val) +{ + writel(val, wo->boot.addr + reg); +} + +static struct sk_buff * +mtk_wed_mcu_msg_alloc(const void *data, int data_len) +{ + int length = sizeof(struct mtk_wed_mcu_hdr) + data_len; + struct sk_buff *skb; + + skb = alloc_skb(length, GFP_KERNEL); + if (!skb) + return NULL; + + memset(skb->head, 0, length); + skb_reserve(skb, sizeof(struct mtk_wed_mcu_hdr)); + if (data && data_len) + skb_put_data(skb, data, data_len); + + return skb; +} + +static struct sk_buff * +mtk_wed_mcu_get_response(struct mtk_wed_wo *wo, unsigned long expires) +{ + if (!time_is_after_jiffies(expires)) + return NULL; + + wait_event_timeout(wo->mcu.wait, !skb_queue_empty(&wo->mcu.res_q), + expires - jiffies); + return skb_dequeue(&wo->mcu.res_q); +} + +void mtk_wed_mcu_rx_event(struct mtk_wed_wo *wo, struct sk_buff *skb) +{ + skb_queue_tail(&wo->mcu.res_q, skb); + wake_up(&wo->mcu.wait); +} + +void mtk_wed_mcu_rx_unsolicited_event(struct mtk_wed_wo *wo, + struct sk_buff *skb) +{ + struct mtk_wed_mcu_hdr *hdr = (struct mtk_wed_mcu_hdr *)skb->data; + + switch (hdr->cmd) { + case MTK_WED_WO_EVT_LOG_DUMP: { + const char *msg = (const char *)(skb->data + sizeof(*hdr)); + + dev_notice(wo->hw->dev, "%s\n", msg); + break; + } + case MTK_WED_WO_EVT_PROFILING: { + struct mtk_wed_wo_log_info *info; + u32 count = (skb->len - sizeof(*hdr)) / sizeof(*info); + int i; + + info = (struct mtk_wed_wo_log_info *)(skb->data + sizeof(*hdr)); + for (i = 0 ; i < count ; i++) + 
dev_notice(wo->hw->dev, + "SN:%u latency: total=%u, rro:%u, mod:%u\n", + le32_to_cpu(info[i].sn), + le32_to_cpu(info[i].total), + le32_to_cpu(info[i].rro), + le32_to_cpu(info[i].mod)); + break; + } + case MTK_WED_WO_EVT_RXCNT_INFO: + break; + default: + break; + } + + dev_kfree_skb(skb); +} + +static int +mtk_wed_mcu_skb_send_msg(struct mtk_wed_wo *wo, struct sk_buff *skb, + int id, int cmd, u16 *wait_seq, bool wait_resp) +{ + struct mtk_wed_mcu_hdr *hdr; + + /* TODO: make it dynamic based on cmd */ + wo->mcu.timeout = 20 * HZ; + + hdr = (struct mtk_wed_mcu_hdr *)skb_push(skb, sizeof(*hdr)); + hdr->cmd = cmd; + hdr->length = cpu_to_le16(skb->len); + + if (wait_resp && wait_seq) { + u16 seq = ++wo->mcu.seq; + + if (!seq) + seq = ++wo->mcu.seq; + *wait_seq = seq; + + hdr->flag |= cpu_to_le16(MTK_WED_WARP_CMD_FLAG_NEED_RSP); + hdr->seq = cpu_to_le16(seq); + } + if (id == MTK_WED_MODULE_ID_WO) + hdr->flag |= cpu_to_le16(MTK_WED_WARP_CMD_FLAG_FROM_TO_WO); + + dev_kfree_skb(skb); + return 0; +} + +static int +mtk_wed_mcu_parse_response(struct mtk_wed_wo *wo, struct sk_buff *skb, + int cmd, int seq) +{ + struct mtk_wed_mcu_hdr *hdr; + + if (!skb) { + dev_err(wo->hw->dev, "Message %08x (seq %d) timeout\n", + cmd, seq); + return -ETIMEDOUT; + } + + hdr = (struct mtk_wed_mcu_hdr *)skb->data; + if (le16_to_cpu(hdr->seq) != seq) + return -EAGAIN; + + skb_pull(skb, sizeof(*hdr)); + switch (cmd) { + case MTK_WED_WO_CMD_RXCNT_INFO: + default: + break; + } + + return 0; +} + +int mtk_wed_mcu_send_msg(struct mtk_wed_wo *wo, int id, int cmd, + const void *data, int len, bool wait_resp) +{ + unsigned long expires; + struct sk_buff *skb; + u16 seq; + int ret; + + skb = mtk_wed_mcu_msg_alloc(data, len); + if (!skb) + return -ENOMEM; + + mutex_lock(&wo->mcu.mutex); + + ret = mtk_wed_mcu_skb_send_msg(wo, skb, id, cmd, &seq, wait_resp); + if (ret || !wait_resp) + goto unlock; + + expires = jiffies + wo->mcu.timeout; + do { + skb = mtk_wed_mcu_get_response(wo, expires); + ret = mtk_wed_mcu_parse_response(wo, skb, cmd, seq); + dev_kfree_skb(skb); + } while (ret == -EAGAIN); + +unlock: + mutex_unlock(&wo->mcu.mutex); + + return ret; +} + +static int +mtk_wed_get_memory_region(struct mtk_wed_wo *wo, + struct mtk_wed_wo_memory_region *region) +{ + struct reserved_mem *rmem; + struct device_node *np; + int index; + + index = of_property_match_string(wo->hw->node, "memory-region-names", + region->name); + if (index < 0) + return index; + + np = of_parse_phandle(wo->hw->node, "memory-region", index); + if (!np) + return -ENODEV; + + rmem = of_reserved_mem_lookup(np); + of_node_put(np); + + if (!rmem) + return -ENODEV; + + region->phy_addr = rmem->base; + region->size = rmem->size; + region->addr = devm_ioremap(wo->hw->dev, region->phy_addr, region->size); + + return !region->addr ? 
-EINVAL : 0; +} + +static int +mtk_wed_mcu_run_firmware(struct mtk_wed_wo *wo, const struct firmware *fw, + struct mtk_wed_wo_memory_region *region) +{ + const u8 *first_region_ptr, *region_ptr, *trailer_ptr, *ptr = fw->data; + const struct mtk_wed_fw_trailer *trailer; + const struct mtk_wed_fw_region *fw_region; + + trailer_ptr = fw->data + fw->size - sizeof(*trailer); + trailer = (const struct mtk_wed_fw_trailer *)trailer_ptr; + region_ptr = trailer_ptr - trailer->num_region * sizeof(*fw_region); + first_region_ptr = region_ptr; + + while (region_ptr < trailer_ptr) { + u32 length; + + fw_region = (const struct mtk_wed_fw_region *)region_ptr; + length = le32_to_cpu(fw_region->len); + + if (region->phy_addr != le32_to_cpu(fw_region->addr)) + goto next; + + if (region->size < length) + goto next; + + if (first_region_ptr < ptr + length) + goto next; + + if (region->shared && region->consumed) + return 0; + + if (!region->shared || !region->consumed) { + memcpy_toio(region->addr, ptr, length); + region->consumed = true; + return 0; + } +next: + region_ptr += sizeof(*fw_region); + ptr += length; + } + + return -EINVAL; +} + +static int +mtk_wed_mcu_load_firmware(struct mtk_wed_wo *wo) +{ + static struct mtk_wed_wo_memory_region mem_region[] = { + [MTK_WED_WO_REGION_EMI] = { + .name = "wo-emi", + }, + [MTK_WED_WO_REGION_ILM] = { + .name = "wo-ilm", + }, + [MTK_WED_WO_REGION_DATA] = { + .name = "wo-data", + .shared = true, + }, + }; + const struct mtk_wed_fw_trailer *trailer; + const struct firmware *fw; + const char *fw_name; + u32 val, boot_cr; + int ret, i; + + /* load firmware region metadata */ + for (i = 0; i < ARRAY_SIZE(mem_region); i++) { + ret = mtk_wed_get_memory_region(wo, &mem_region[i]); + if (ret) + return ret; + } + + wo->boot.name = "wo-boot"; + ret = mtk_wed_get_memory_region(wo, &wo->boot); + if (ret) + return ret; + + /* set dummy cr */ + wed_w32(wo->hw->wed_dev, MTK_WED_SCR0 + 4 * MTK_WED_DUMMY_CR_FWDL, + wo->hw->index + 1); + + /* load firmware */ + fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1 : MT7986_FIRMWARE_WO0; + ret = request_firmware(&fw, fw_name, wo->hw->dev); + if (ret) + return ret; + + trailer = (void *)(fw->data + fw->size - + sizeof(struct mtk_wed_fw_trailer)); + dev_info(wo->hw->dev, + "MTK WED WO Firmware Version: %.10s, Build Time: %.15s\n", + trailer->fw_ver, trailer->build_date); + dev_info(wo->hw->dev, "MTK WED WO Chip ID %02x Region %d\n", + trailer->chip_id, trailer->num_region); + + for (i = 0; i < ARRAY_SIZE(mem_region); i++) { + ret = mtk_wed_mcu_run_firmware(wo, fw, &mem_region[i]); + if (ret) + goto out; + } + + /* set the start address */ + boot_cr = wo->hw->index ? MTK_WO_MCU_CFG_LS_WA_BOOT_ADDR_ADDR + : MTK_WO_MCU_CFG_LS_WM_BOOT_ADDR_ADDR; + wo_w32(wo, boot_cr, mem_region[MTK_WED_WO_REGION_EMI].phy_addr >> 16); + /* wo firmware reset */ + wo_w32(wo, MTK_WO_MCU_CFG_LS_WF_MCCR_CLR_ADDR, 0xc00); + + val = wo_r32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR); + val |= wo->hw->index ? 
MTK_WO_MCU_CFG_LS_WF_WM_WA_WA_CPU_RSTB_MASK + : MTK_WO_MCU_CFG_LS_WF_WM_WA_WM_CPU_RSTB_MASK; + wo_w32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR, val); +out: + release_firmware(fw); + + return ret; +} + +static u32 +mtk_wed_mcu_read_fw_dl(struct mtk_wed_wo *wo) +{ + return wed_r32(wo->hw->wed_dev, + MTK_WED_SCR0 + 4 * MTK_WED_DUMMY_CR_FWDL); +} + +int mtk_wed_mcu_init(struct mtk_wed_wo *wo) +{ + u32 val; + int ret; + + skb_queue_head_init(&wo->mcu.res_q); + init_waitqueue_head(&wo->mcu.wait); + mutex_init(&wo->mcu.mutex); + + ret = mtk_wed_mcu_load_firmware(wo); + if (ret) + return ret; + + return readx_poll_timeout(mtk_wed_mcu_read_fw_dl, wo, val, !val, + 100, MTK_FW_DL_TIMEOUT); +} + +MODULE_FIRMWARE(MT7986_FIRMWARE_WO0); +MODULE_FIRMWARE(MT7986_FIRMWARE_WO1); diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h index e270fb336143..c940b3bb215b 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h +++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h @@ -152,6 +152,7 @@ struct mtk_wdma_desc { #define MTK_WED_RING_RX(_n) (0x400 + (_n) * 0x10) +#define MTK_WED_SCR0 0x3c0 #define MTK_WED_WPDMA_INT_TRIGGER 0x504 #define MTK_WED_WPDMA_INT_TRIGGER_RX_DONE BIT(1) #define MTK_WED_WPDMA_INT_TRIGGER_TX_DONE GENMASK(5, 4) diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.h b/drivers/net/ethernet/mediatek/mtk_wed_wo.h new file mode 100644 index 000000000000..5413a2308084 --- /dev/null +++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2022 Lorenzo Bianconi */ + +#ifndef __MTK_WED_WO_H +#define __MTK_WED_WO_H + +#include +#include + +struct mtk_wed_hw; + +struct mtk_wed_mcu_hdr { + /* DW0 */ + u8 version; + u8 cmd; + __le16 length; + + /* DW1 */ + __le16 seq; + __le16 flag; + + /* DW2 */ + __le32 status; + + /* DW3 */ + u8 rsv[20]; +}; + +struct mtk_wed_wo_log_info { + __le32 sn; + __le32 total; + __le32 rro; + __le32 mod; +}; + +enum mtk_wed_wo_event { + MTK_WED_WO_EVT_LOG_DUMP = 0x1, + MTK_WED_WO_EVT_PROFILING = 0x2, + MTK_WED_WO_EVT_RXCNT_INFO = 0x3, +}; + +#define MTK_WED_MODULE_ID_WO 1 +#define MTK_FW_DL_TIMEOUT 4000000 /* us */ +#define MTK_WOCPU_TIMEOUT 2000000 /* us */ + +enum { + MTK_WED_WARP_CMD_FLAG_RSP = BIT(0), + MTK_WED_WARP_CMD_FLAG_NEED_RSP = BIT(1), + MTK_WED_WARP_CMD_FLAG_FROM_TO_WO = BIT(2), +}; + +enum { + MTK_WED_WO_REGION_EMI, + MTK_WED_WO_REGION_ILM, + MTK_WED_WO_REGION_DATA, + MTK_WED_WO_REGION_BOOT, + __MTK_WED_WO_REGION_MAX, +}; + +enum mtk_wed_dummy_cr_idx { + MTK_WED_DUMMY_CR_FWDL, + MTK_WED_DUMMY_CR_WO_STATUS, +}; + +#define MT7986_FIRMWARE_WO0 "mediatek/mt7986_wo_0.bin" +#define MT7986_FIRMWARE_WO1 "mediatek/mt7986_wo_1.bin" + +#define MTK_WO_MCU_CFG_LS_BASE 0 +#define MTK_WO_MCU_CFG_LS_HW_VER_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x000) +#define MTK_WO_MCU_CFG_LS_FW_VER_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x004) +#define MTK_WO_MCU_CFG_LS_CFG_DBG1_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x00c) +#define MTK_WO_MCU_CFG_LS_CFG_DBG2_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x010) +#define MTK_WO_MCU_CFG_LS_WF_MCCR_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x014) +#define MTK_WO_MCU_CFG_LS_WF_MCCR_SET_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x018) +#define MTK_WO_MCU_CFG_LS_WF_MCCR_CLR_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x01c) +#define MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x050) +#define MTK_WO_MCU_CFG_LS_WM_BOOT_ADDR_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x060) +#define MTK_WO_MCU_CFG_LS_WA_BOOT_ADDR_ADDR (MTK_WO_MCU_CFG_LS_BASE + 0x064) + +#define 
MTK_WO_MCU_CFG_LS_WF_WM_WA_WM_CPU_RSTB_MASK BIT(5) +#define MTK_WO_MCU_CFG_LS_WF_WM_WA_WA_CPU_RSTB_MASK BIT(0) + +struct mtk_wed_wo_memory_region { + const char *name; + void __iomem *addr; + phys_addr_t phy_addr; + u32 size; + bool shared:1; + bool consumed:1; +}; + +struct mtk_wed_fw_region { + __le32 decomp_crc; + __le32 decomp_len; + __le32 decomp_blk_sz; + u8 rsv0[4]; + __le32 addr; + __le32 len; + u8 feature_set; + u8 rsv1[15]; +} __packed; + +struct mtk_wed_fw_trailer { + u8 chip_id; + u8 eco_code; + u8 num_region; + u8 format_ver; + u8 format_flag; + u8 rsv[2]; + char fw_ver[10]; + char build_date[15]; + u32 crc; +}; + +struct mtk_wed_wo { + struct mtk_wed_hw *hw; + struct mtk_wed_wo_memory_region boot; + + struct { + struct mutex mutex; + int timeout; + u16 seq; + + struct sk_buff_head res_q; + wait_queue_head_t wait; + } mcu; +}; + +static inline int +mtk_wed_mcu_check_msg(struct mtk_wed_wo *wo, struct sk_buff *skb) +{ + struct mtk_wed_mcu_hdr *hdr = (struct mtk_wed_mcu_hdr *)skb->data; + + if (hdr->version) + return -EINVAL; + + if (skb->len < sizeof(*hdr) || skb->len != le16_to_cpu(hdr->length)) + return -EINVAL; + + return 0; +} + +void mtk_wed_mcu_rx_event(struct mtk_wed_wo *wo, struct sk_buff *skb); +void mtk_wed_mcu_rx_unsolicited_event(struct mtk_wed_wo *wo, + struct sk_buff *skb); +int mtk_wed_mcu_send_msg(struct mtk_wed_wo *wo, int id, int cmd, + const void *data, int len, bool wait_resp); +int mtk_wed_mcu_init(struct mtk_wed_wo *wo); + +#endif /* __MTK_WED_WO_H */ diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 4450c8b7a1cb..2cc2f1e43ba9 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,35 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_wo_cmd { + MTK_WED_WO_CMD_WED_CFG, + MTK_WED_WO_CMD_WED_RX_STAT, + MTK_WED_WO_CMD_RRO_SER, + MTK_WED_WO_CMD_DBG_INFO, + MTK_WED_WO_CMD_DEV_INFO, + MTK_WED_WO_CMD_BSS_INFO, + MTK_WED_WO_CMD_STA_REC, + MTK_WED_WO_CMD_DEV_INFO_DUMP, + MTK_WED_WO_CMD_BSS_INFO_DUMP, + MTK_WED_WO_CMD_STA_REC_DUMP, + MTK_WED_WO_CMD_BA_INFO_DUMP, + MTK_WED_WO_CMD_FBCMD_Q_DUMP, + MTK_WED_WO_CMD_FW_LOG_CTRL, + MTK_WED_WO_CMD_LOG_FLUSH, + MTK_WED_WO_CMD_CHANGE_STATE, + MTK_WED_WO_CMD_CPU_STATS_ENABLE, + MTK_WED_WO_CMD_CPU_STATS_DUMP, + MTK_WED_WO_CMD_EXCEPTION_INIT, + MTK_WED_WO_CMD_PROF_CTRL, + MTK_WED_WO_CMD_STA_BA_DUMP, + MTK_WED_WO_CMD_BA_CTRL_DUMP, + MTK_WED_WO_CMD_RXCNT_CTRL, + MTK_WED_WO_CMD_RXCNT_INFO, + MTK_WED_WO_CMD_SET_CAP, + MTK_WED_WO_CMD_CCIF_RING_DUMP, + MTK_WED_WO_CMD_WED_END +}; + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, -- cgit v1.2.3 From 084d60ce0c6cef96024d53a58b92d7ff2d8b9318 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:20 +0100 Subject: net: ethernet: mtk_wed: rename tx_wdma array in rx_wdma Rename the tx_wdma queue array to rx_wdma, since these rings are the rx side of the SoC WDMA. Moreover, rename the mtk_wed_wdma_ring_setup() routine to mtk_wed_wdma_rx_ring_setup(). Signed-off-by: Lorenzo Bianconi Signed-off-by: David S.
Miller --- drivers/net/ethernet/mediatek/mtk_wed.c | 16 ++++++++-------- include/linux/soc/mediatek/mtk_wed.h | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 9c9dd17332b6..e904596e67de 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -253,8 +253,8 @@ mtk_wed_free_tx_rings(struct mtk_wed_device *dev) for (i = 0; i < ARRAY_SIZE(dev->tx_ring); i++) mtk_wed_free_ring(dev, &dev->tx_ring[i]); - for (i = 0; i < ARRAY_SIZE(dev->tx_wdma); i++) - mtk_wed_free_ring(dev, &dev->tx_wdma[i]); + for (i = 0; i < ARRAY_SIZE(dev->rx_wdma); i++) + mtk_wed_free_ring(dev, &dev->rx_wdma[i]); } static void @@ -688,10 +688,10 @@ mtk_wed_ring_alloc(struct mtk_wed_device *dev, struct mtk_wed_ring *ring, } static int -mtk_wed_wdma_ring_setup(struct mtk_wed_device *dev, int idx, int size) +mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size) { u32 desc_size = sizeof(struct mtk_wdma_desc) * dev->hw->version; - struct mtk_wed_ring *wdma = &dev->tx_wdma[idx]; + struct mtk_wed_ring *wdma = &dev->rx_wdma[idx]; if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size)) return -ENOMEM; @@ -805,9 +805,9 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask) { int i; - for (i = 0; i < ARRAY_SIZE(dev->tx_wdma); i++) - if (!dev->tx_wdma[i].desc) - mtk_wed_wdma_ring_setup(dev, i, 16); + for (i = 0; i < ARRAY_SIZE(dev->rx_wdma); i++) + if (!dev->rx_wdma[i].desc) + mtk_wed_wdma_rx_ring_setup(dev, i, 16); mtk_wed_hw_init(dev); mtk_wed_configure_irq(dev, irq_mask); @@ -916,7 +916,7 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) sizeof(*ring->desc))) return -ENOMEM; - if (mtk_wed_wdma_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE)) + if (mtk_wed_wdma_rx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE)) return -ENOMEM; ring->reg_base = MTK_WED_RING_TX(idx); diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 2cc2f1e43ba9..956978320f8b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -7,6 +7,7 @@ #include #define MTK_WED_TX_QUEUES 2 +#define MTK_WED_RX_QUEUES 2 struct mtk_wed_hw; struct mtk_wdma_desc; @@ -66,7 +67,7 @@ struct mtk_wed_device { struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring txfree_ring; - struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { int size; -- cgit v1.2.3 From 4c5de09eb0d05fc9e73ff8f0e052622f1f3df6d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:21 +0100 Subject: net: ethernet: mtk_wed: add configure wed wo support Enable RX Wireless Ethernet Dispatch available on MT7986 Soc. Tested-by: Daniel Golle Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mediatek/mtk_wed.c | 600 +++++++++++++++++++++++++-- drivers/net/ethernet/mediatek/mtk_wed.h | 19 + drivers/net/ethernet/mediatek/mtk_wed_mcu.c | 45 +- drivers/net/ethernet/mediatek/mtk_wed_regs.h | 128 +++++- drivers/net/ethernet/mediatek/mtk_wed_wo.h | 28 ++ include/linux/soc/mediatek/mtk_wed.h | 76 +++- 6 files changed, 842 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index e904596e67de..7d8842378c2b 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #define MTK_WED_PKT_SIZE 1900 #define MTK_WED_BUF_SIZE 2048 #define MTK_WED_BUF_PER_PAGE (PAGE_SIZE / 2048) +#define MTK_WED_RX_RING_SIZE 1536 #define MTK_WED_TX_RING_SIZE 2048 #define MTK_WED_WDMA_RING_SIZE 1024 @@ -31,6 +33,10 @@ #define MTK_WED_PER_GROUP_PKT 128 #define MTK_WED_FBUF_SIZE 128 +#define MTK_WED_MIOD_CNT 16 +#define MTK_WED_FB_CMD_CNT 1024 +#define MTK_WED_RRO_QUE_CNT 8192 +#define MTK_WED_MIOD_ENTRY_CNT 128 static struct mtk_wed_hw *hw_list[2]; static DEFINE_MUTEX(hw_lock); @@ -65,12 +71,76 @@ wdma_set(struct mtk_wed_device *dev, u32 reg, u32 mask) wdma_m32(dev, reg, 0, mask); } +static void +wdma_clr(struct mtk_wed_device *dev, u32 reg, u32 mask) +{ + wdma_m32(dev, reg, mask, 0); +} + +static u32 +wifi_r32(struct mtk_wed_device *dev, u32 reg) +{ + return readl(dev->wlan.base + reg); +} + +static void +wifi_w32(struct mtk_wed_device *dev, u32 reg, u32 val) +{ + writel(val, dev->wlan.base + reg); +} + static u32 mtk_wed_read_reset(struct mtk_wed_device *dev) { return wed_r32(dev, MTK_WED_RESET); } +static u32 +mtk_wdma_read_reset(struct mtk_wed_device *dev) +{ + return wdma_r32(dev, MTK_WDMA_GLO_CFG); +} + +static void +mtk_wdma_rx_reset(struct mtk_wed_device *dev) +{ + u32 status, mask = MTK_WDMA_GLO_CFG_RX_DMA_BUSY; + int i; + + wdma_clr(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_RX_DMA_EN); + if (readx_poll_timeout(mtk_wdma_read_reset, dev, status, + !(status & mask), 0, 1000)) + dev_err(dev->hw->dev, "rx reset failed\n"); + + for (i = 0; i < ARRAY_SIZE(dev->rx_wdma); i++) { + if (dev->rx_wdma[i].desc) + continue; + + wdma_w32(dev, + MTK_WDMA_RING_RX(i) + MTK_WED_RING_OFS_CPU_IDX, 0); + } +} + +static void +mtk_wdma_tx_reset(struct mtk_wed_device *dev) +{ + u32 status, mask = MTK_WDMA_GLO_CFG_TX_DMA_BUSY; + int i; + + wdma_clr(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_TX_DMA_EN); + if (readx_poll_timeout(mtk_wdma_read_reset, dev, status, + !(status & mask), 0, 1000)) + dev_err(dev->hw->dev, "tx reset failed\n"); + + for (i = 0; i < ARRAY_SIZE(dev->tx_wdma); i++) { + if (dev->tx_wdma[i].desc) + continue; + + wdma_w32(dev, + MTK_WDMA_RING_TX(i) + MTK_WED_RING_OFS_CPU_IDX, 0); + } +} + static void mtk_wed_reset(struct mtk_wed_device *dev, u32 mask) { @@ -82,6 +152,54 @@ mtk_wed_reset(struct mtk_wed_device *dev, u32 mask) WARN_ON_ONCE(1); } +static u32 +mtk_wed_wo_read_status(struct mtk_wed_device *dev) +{ + return wed_r32(dev, MTK_WED_SCR0 + 4 * MTK_WED_DUMMY_CR_WO_STATUS); +} + +static void +mtk_wed_wo_reset(struct mtk_wed_device *dev) +{ + struct mtk_wed_wo *wo = dev->hw->wed_wo; + u8 state = MTK_WED_WO_STATE_DISABLE; + void __iomem *reg; + u32 val; + + mtk_wdma_tx_reset(dev); + mtk_wed_reset(dev, MTK_WED_RESET_WED); + + mtk_wed_mcu_send_msg(wo, MTK_WED_MODULE_ID_WO, + MTK_WED_WO_CMD_CHANGE_STATE, &state, + sizeof(state), false); + + if 
(readx_poll_timeout(mtk_wed_wo_read_status, dev, val, + val == MTK_WED_WOIF_DISABLE_DONE, + 100, MTK_WOCPU_TIMEOUT)) + dev_err(dev->hw->dev, "failed to disable wed-wo\n"); + + reg = ioremap(MTK_WED_WO_CPU_MCUSYS_RESET_ADDR, 4); + + val = readl(reg); + switch (dev->hw->index) { + case 0: + val |= MTK_WED_WO_CPU_WO0_MCUSYS_RESET_MASK; + writel(val, reg); + val &= ~MTK_WED_WO_CPU_WO0_MCUSYS_RESET_MASK; + writel(val, reg); + break; + case 1: + val |= MTK_WED_WO_CPU_WO1_MCUSYS_RESET_MASK; + writel(val, reg); + val &= ~MTK_WED_WO_CPU_WO1_MCUSYS_RESET_MASK; + writel(val, reg); + break; + default: + break; + } + iounmap(reg); +} + static struct mtk_wed_hw * mtk_wed_assign(struct mtk_wed_device *dev) { @@ -116,7 +234,7 @@ out: } static int -mtk_wed_buffer_alloc(struct mtk_wed_device *dev) +mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev) { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; @@ -133,16 +251,16 @@ mtk_wed_buffer_alloc(struct mtk_wed_device *dev) if (!page_list) return -ENOMEM; - dev->buf_ring.size = ring_size; - dev->buf_ring.pages = page_list; + dev->tx_buf_ring.size = ring_size; + dev->tx_buf_ring.pages = page_list; desc = dma_alloc_coherent(dev->hw->dev, ring_size * sizeof(*desc), &desc_phys, GFP_KERNEL); if (!desc) return -ENOMEM; - dev->buf_ring.desc = desc; - dev->buf_ring.desc_phys = desc_phys; + dev->tx_buf_ring.desc = desc; + dev->tx_buf_ring.desc_phys = desc_phys; for (i = 0, page_idx = 0; i < ring_size; i += MTK_WED_BUF_PER_PAGE) { dma_addr_t page_phys, buf_phys; @@ -203,10 +321,10 @@ mtk_wed_buffer_alloc(struct mtk_wed_device *dev) } static void -mtk_wed_free_buffer(struct mtk_wed_device *dev) +mtk_wed_free_tx_buffer(struct mtk_wed_device *dev) { - struct mtk_wdma_desc *desc = dev->buf_ring.desc; - void **page_list = dev->buf_ring.pages; + struct mtk_wdma_desc *desc = dev->tx_buf_ring.desc; + void **page_list = dev->tx_buf_ring.pages; int page_idx; int i; @@ -216,7 +334,8 @@ mtk_wed_free_buffer(struct mtk_wed_device *dev) if (!desc) goto free_pagelist; - for (i = 0, page_idx = 0; i < dev->buf_ring.size; i += MTK_WED_BUF_PER_PAGE) { + for (i = 0, page_idx = 0; i < dev->tx_buf_ring.size; + i += MTK_WED_BUF_PER_PAGE) { void *page = page_list[page_idx++]; dma_addr_t buf_addr; @@ -229,13 +348,59 @@ mtk_wed_free_buffer(struct mtk_wed_device *dev) __free_page(page); } - dma_free_coherent(dev->hw->dev, dev->buf_ring.size * sizeof(*desc), - desc, dev->buf_ring.desc_phys); + dma_free_coherent(dev->hw->dev, dev->tx_buf_ring.size * sizeof(*desc), + desc, dev->tx_buf_ring.desc_phys); free_pagelist: kfree(page_list); } +static int +mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev) +{ + struct mtk_rxbm_desc *desc; + dma_addr_t desc_phys; + + dev->rx_buf_ring.size = dev->wlan.rx_nbuf; + desc = dma_alloc_coherent(dev->hw->dev, + dev->wlan.rx_nbuf * sizeof(*desc), + &desc_phys, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + dev->rx_buf_ring.desc = desc; + dev->rx_buf_ring.desc_phys = desc_phys; + dev->wlan.init_rx_buf(dev, dev->wlan.rx_npkt); + + return 0; +} + +static void +mtk_wed_free_rx_buffer(struct mtk_wed_device *dev) +{ + struct mtk_rxbm_desc *desc = dev->rx_buf_ring.desc; + + if (!desc) + return; + + dev->wlan.release_rx_buf(dev); + dma_free_coherent(dev->hw->dev, dev->rx_buf_ring.size * sizeof(*desc), + desc, dev->rx_buf_ring.desc_phys); +} + +static void +mtk_wed_rx_buffer_hw_init(struct mtk_wed_device *dev) +{ + wed_w32(dev, MTK_WED_RX_BM_RX_DMAD, + FIELD_PREP(MTK_WED_RX_BM_RX_DMAD_SDL0, dev->wlan.rx_size)); + wed_w32(dev, MTK_WED_RX_BM_BASE, dev->rx_buf_ring.desc_phys); + 
wed_w32(dev, MTK_WED_RX_BM_INIT_PTR, MTK_WED_RX_BM_INIT_SW_TAIL | + FIELD_PREP(MTK_WED_RX_BM_SW_TAIL, dev->wlan.rx_npkt)); + wed_w32(dev, MTK_WED_RX_BM_DYN_ALLOC_TH, + FIELD_PREP(MTK_WED_RX_BM_DYN_ALLOC_TH_H, 0xffff)); + wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_BM_EN); +} + static void mtk_wed_free_ring(struct mtk_wed_device *dev, struct mtk_wed_ring *ring) { @@ -246,6 +411,13 @@ mtk_wed_free_ring(struct mtk_wed_device *dev, struct mtk_wed_ring *ring) ring->desc, ring->desc_phys); } +static void +mtk_wed_free_rx_rings(struct mtk_wed_device *dev) +{ + mtk_wed_free_rx_buffer(dev); + mtk_wed_free_ring(dev, &dev->rro.ring); +} + static void mtk_wed_free_tx_rings(struct mtk_wed_device *dev) { @@ -291,6 +463,38 @@ mtk_wed_set_512_support(struct mtk_wed_device *dev, bool enable) } } +#define MTK_WFMDA_RX_DMA_EN BIT(2) +static void +mtk_wed_check_wfdma_rx_fill(struct mtk_wed_device *dev, int idx) +{ + u32 val; + int i; + + if (!(dev->rx_ring[idx].flags & MTK_WED_RING_CONFIGURED)) + return; /* queue is not configured by mt76 */ + + for (i = 0; i < 3; i++) { + u32 cur_idx; + + cur_idx = wed_r32(dev, + MTK_WED_WPDMA_RING_RX_DATA(idx) + + MTK_WED_RING_OFS_CPU_IDX); + if (cur_idx == MTK_WED_RX_RING_SIZE - 1) + break; + + usleep_range(100000, 200000); + } + + if (i == 3) { + dev_err(dev->hw->dev, "rx dma enable failed\n"); + return; + } + + val = wifi_r32(dev, dev->wlan.wpdma_rx_glo - dev->wlan.phy_base) | + MTK_WFMDA_RX_DMA_EN; + wifi_w32(dev, dev->wlan.wpdma_rx_glo - dev->wlan.phy_base, val); +} + static void mtk_wed_dma_disable(struct mtk_wed_device *dev) { @@ -304,20 +508,25 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) MTK_WED_GLO_CFG_TX_DMA_EN | MTK_WED_GLO_CFG_RX_DMA_EN); - wdma_m32(dev, MTK_WDMA_GLO_CFG, + wdma_clr(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_TX_DMA_EN | MTK_WDMA_GLO_CFG_RX_INFO1_PRERES | - MTK_WDMA_GLO_CFG_RX_INFO2_PRERES, 0); + MTK_WDMA_GLO_CFG_RX_INFO2_PRERES); if (dev->hw->version == 1) { regmap_write(dev->hw->mirror, dev->hw->index * 4, 0); - wdma_m32(dev, MTK_WDMA_GLO_CFG, - MTK_WDMA_GLO_CFG_RX_INFO3_PRERES, 0); + wdma_clr(dev, MTK_WDMA_GLO_CFG, + MTK_WDMA_GLO_CFG_RX_INFO3_PRERES); } else { wed_clr(dev, MTK_WED_WPDMA_GLO_CFG, MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_PKT_PROC | MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_CRX_SYNC); + wed_clr(dev, MTK_WED_WPDMA_RX_D_GLO_CFG, + MTK_WED_WPDMA_RX_D_RX_DRV_EN); + wed_clr(dev, MTK_WED_WDMA_GLO_CFG, + MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK); + mtk_wed_set_512_support(dev, false); } } @@ -338,6 +547,13 @@ mtk_wed_stop(struct mtk_wed_device *dev) wdma_w32(dev, MTK_WDMA_INT_MASK, 0); wdma_w32(dev, MTK_WDMA_INT_GRP2, 0); wed_w32(dev, MTK_WED_WPDMA_INT_MASK, 0); + + if (dev->hw->version == 1) + return; + + wed_w32(dev, MTK_WED_EXT_INT_MASK1, 0); + wed_w32(dev, MTK_WED_EXT_INT_MASK2, 0); + wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_BM_EN); } static void @@ -353,11 +569,21 @@ mtk_wed_detach(struct mtk_wed_device *dev) wdma_w32(dev, MTK_WDMA_RESET_IDX, 0); mtk_wed_reset(dev, MTK_WED_RESET_WED); + if (mtk_wed_get_rx_capa(dev)) { + wdma_clr(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_TX_DMA_EN); + wdma_w32(dev, MTK_WDMA_RESET_IDX, MTK_WDMA_RESET_IDX_TX); + wdma_w32(dev, MTK_WDMA_RESET_IDX, 0); + } - mtk_wed_free_buffer(dev); + mtk_wed_free_tx_buffer(dev); mtk_wed_free_tx_rings(dev); - if (hw->version != 1) + + if (mtk_wed_get_rx_capa(dev)) { + mtk_wed_wo_reset(dev); + mtk_wed_free_rx_rings(dev); mtk_wed_wo_deinit(hw); + mtk_wdma_rx_reset(dev); + } if (dev->wlan.bus_type == MTK_WED_BUS_PCIE) { struct device_node *wlan_node; @@ -434,10 +660,12 @@ 
mtk_wed_set_wpdma(struct mtk_wed_device *dev) } else { mtk_wed_bus_init(dev); - wed_w32(dev, MTK_WED_WPDMA_CFG_BASE, dev->wlan.wpdma_int); - wed_w32(dev, MTK_WED_WPDMA_CFG_INT_MASK, dev->wlan.wpdma_mask); - wed_w32(dev, MTK_WED_WPDMA_CFG_TX, dev->wlan.wpdma_tx); - wed_w32(dev, MTK_WED_WPDMA_CFG_TX_FREE, dev->wlan.wpdma_txfree); + wed_w32(dev, MTK_WED_WPDMA_CFG_BASE, dev->wlan.wpdma_int); + wed_w32(dev, MTK_WED_WPDMA_CFG_INT_MASK, dev->wlan.wpdma_mask); + wed_w32(dev, MTK_WED_WPDMA_CFG_TX, dev->wlan.wpdma_tx); + wed_w32(dev, MTK_WED_WPDMA_CFG_TX_FREE, dev->wlan.wpdma_txfree); + wed_w32(dev, MTK_WED_WPDMA_RX_GLO_CFG, dev->wlan.wpdma_rx_glo); + wed_w32(dev, MTK_WED_WPDMA_RX_RING, dev->wlan.wpdma_rx); } } @@ -487,6 +715,132 @@ mtk_wed_hw_init_early(struct mtk_wed_device *dev) } } +static int +mtk_wed_rro_ring_alloc(struct mtk_wed_device *dev, struct mtk_wed_ring *ring, + int size) +{ + ring->desc = dma_alloc_coherent(dev->hw->dev, + size * sizeof(*ring->desc), + &ring->desc_phys, GFP_KERNEL); + if (!ring->desc) + return -ENOMEM; + + ring->desc_size = sizeof(*ring->desc); + ring->size = size; + memset(ring->desc, 0, size); + + return 0; +} + +#define MTK_WED_MIOD_COUNT (MTK_WED_MIOD_ENTRY_CNT * MTK_WED_MIOD_CNT) +static int +mtk_wed_rro_alloc(struct mtk_wed_device *dev) +{ + struct reserved_mem *rmem; + struct device_node *np; + int index; + + index = of_property_match_string(dev->hw->node, "memory-region-names", + "wo-dlm"); + if (index < 0) + return index; + + np = of_parse_phandle(dev->hw->node, "memory-region", index); + if (!np) + return -ENODEV; + + rmem = of_reserved_mem_lookup(np); + of_node_put(np); + + if (!rmem) + return -ENODEV; + + dev->rro.miod_phys = rmem->base; + dev->rro.fdbk_phys = MTK_WED_MIOD_COUNT + dev->rro.miod_phys; + + return mtk_wed_rro_ring_alloc(dev, &dev->rro.ring, + MTK_WED_RRO_QUE_CNT); +} + +static int +mtk_wed_rro_cfg(struct mtk_wed_device *dev) +{ + struct mtk_wed_wo *wo = dev->hw->wed_wo; + struct { + struct { + __le32 base; + __le32 cnt; + __le32 unit; + } ring[2]; + __le32 wed; + u8 version; + } req = { + .ring[0] = { + .base = cpu_to_le32(MTK_WED_WOCPU_VIEW_MIOD_BASE), + .cnt = cpu_to_le32(MTK_WED_MIOD_CNT), + .unit = cpu_to_le32(MTK_WED_MIOD_ENTRY_CNT), + }, + .ring[1] = { + .base = cpu_to_le32(MTK_WED_WOCPU_VIEW_MIOD_BASE + + MTK_WED_MIOD_COUNT), + .cnt = cpu_to_le32(MTK_WED_FB_CMD_CNT), + .unit = cpu_to_le32(4), + }, + }; + + return mtk_wed_mcu_send_msg(wo, MTK_WED_MODULE_ID_WO, + MTK_WED_WO_CMD_WED_CFG, + &req, sizeof(req), true); +} + +static void +mtk_wed_rro_hw_init(struct mtk_wed_device *dev) +{ + wed_w32(dev, MTK_WED_RROQM_MIOD_CFG, + FIELD_PREP(MTK_WED_RROQM_MIOD_MID_DW, 0x70 >> 2) | + FIELD_PREP(MTK_WED_RROQM_MIOD_MOD_DW, 0x10 >> 2) | + FIELD_PREP(MTK_WED_RROQM_MIOD_ENTRY_DW, + MTK_WED_MIOD_ENTRY_CNT >> 2)); + + wed_w32(dev, MTK_WED_RROQM_MIOD_CTRL0, dev->rro.miod_phys); + wed_w32(dev, MTK_WED_RROQM_MIOD_CTRL1, + FIELD_PREP(MTK_WED_RROQM_MIOD_CNT, MTK_WED_MIOD_CNT)); + wed_w32(dev, MTK_WED_RROQM_FDBK_CTRL0, dev->rro.fdbk_phys); + wed_w32(dev, MTK_WED_RROQM_FDBK_CTRL1, + FIELD_PREP(MTK_WED_RROQM_FDBK_CNT, MTK_WED_FB_CMD_CNT)); + wed_w32(dev, MTK_WED_RROQM_FDBK_CTRL2, 0); + wed_w32(dev, MTK_WED_RROQ_BASE_L, dev->rro.ring.desc_phys); + + wed_set(dev, MTK_WED_RROQM_RST_IDX, + MTK_WED_RROQM_RST_IDX_MIOD | + MTK_WED_RROQM_RST_IDX_FDBK); + + wed_w32(dev, MTK_WED_RROQM_RST_IDX, 0); + wed_w32(dev, MTK_WED_RROQM_MIOD_CTRL2, MTK_WED_MIOD_CNT - 1); + wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_RX_RRO_QM_EN); +} + +static void +mtk_wed_route_qm_hw_init(struct 
mtk_wed_device *dev) +{ + wed_w32(dev, MTK_WED_RESET, MTK_WED_RESET_RX_ROUTE_QM); + + for (;;) { + usleep_range(100, 200); + if (!(wed_r32(dev, MTK_WED_RESET) & MTK_WED_RESET_RX_ROUTE_QM)) + break; + } + + /* configure RX_ROUTE_QM */ + wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST); + wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_TXDMAD_FPORT); + wed_set(dev, MTK_WED_RTQM_GLO_CFG, + FIELD_PREP(MTK_WED_RTQM_TXDMAD_FPORT, 0x3 + dev->hw->index)); + wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST); + /* enable RX_ROUTE_QM */ + wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_RX_ROUTE_QM_EN); +} + static void mtk_wed_hw_init(struct mtk_wed_device *dev) { @@ -498,11 +852,11 @@ mtk_wed_hw_init(struct mtk_wed_device *dev) wed_w32(dev, MTK_WED_TX_BM_CTRL, MTK_WED_TX_BM_CTRL_PAUSE | FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM, - dev->buf_ring.size / 128) | + dev->tx_buf_ring.size / 128) | FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM, MTK_WED_TX_RING_SIZE / 256)); - wed_w32(dev, MTK_WED_TX_BM_BASE, dev->buf_ring.desc_phys); + wed_w32(dev, MTK_WED_TX_BM_BASE, dev->tx_buf_ring.desc_phys); wed_w32(dev, MTK_WED_TX_BM_BUF_LEN, MTK_WED_PKT_SIZE); @@ -529,9 +883,9 @@ mtk_wed_hw_init(struct mtk_wed_device *dev) wed_w32(dev, MTK_WED_TX_TKID_CTRL, MTK_WED_TX_TKID_CTRL_PAUSE | FIELD_PREP(MTK_WED_TX_TKID_CTRL_VLD_GRP_NUM, - dev->buf_ring.size / 128) | + dev->tx_buf_ring.size / 128) | FIELD_PREP(MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM, - dev->buf_ring.size / 128)); + dev->tx_buf_ring.size / 128)); wed_w32(dev, MTK_WED_TX_TKID_DYN_THR, FIELD_PREP(MTK_WED_TX_TKID_DYN_THR_LO, 0) | MTK_WED_TX_TKID_DYN_THR_HI); @@ -539,18 +893,28 @@ mtk_wed_hw_init(struct mtk_wed_device *dev) mtk_wed_reset(dev, MTK_WED_RESET_TX_BM); - if (dev->hw->version == 1) + if (dev->hw->version == 1) { wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_TX_BM_EN | MTK_WED_CTRL_WED_TX_FREE_AGENT_EN); - else + } else { wed_clr(dev, MTK_WED_TX_TKID_CTRL, MTK_WED_TX_TKID_CTRL_PAUSE); + /* rx hw init */ + wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX, + MTK_WED_WPDMA_RX_D_RST_CRX_IDX | + MTK_WED_WPDMA_RX_D_RST_DRV_IDX); + wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX, 0); + + mtk_wed_rx_buffer_hw_init(dev); + mtk_wed_rro_hw_init(dev); + mtk_wed_route_qm_hw_init(dev); + } wed_clr(dev, MTK_WED_TX_BM_CTRL, MTK_WED_TX_BM_CTRL_PAUSE); } static void -mtk_wed_ring_reset(struct mtk_wed_ring *ring, int size) +mtk_wed_ring_reset(struct mtk_wed_ring *ring, int size, bool tx) { void *head = (void *)ring->desc; int i; @@ -560,7 +924,10 @@ mtk_wed_ring_reset(struct mtk_wed_ring *ring, int size) desc = (struct mtk_wdma_desc *)(head + i * ring->desc_size); desc->buf0 = 0; - desc->ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE); + if (tx) + desc->ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE); + else + desc->ctrl = cpu_to_le32(MTK_WFDMA_DESC_CTRL_TO_HOST); desc->buf1 = 0; desc->info = 0; } @@ -616,7 +983,8 @@ mtk_wed_reset_dma(struct mtk_wed_device *dev) if (!dev->tx_ring[i].desc) continue; - mtk_wed_ring_reset(&dev->tx_ring[i], MTK_WED_TX_RING_SIZE); + mtk_wed_ring_reset(&dev->tx_ring[i], MTK_WED_TX_RING_SIZE, + true); } if (mtk_wed_poll_busy(dev)) @@ -634,6 +1002,9 @@ mtk_wed_reset_dma(struct mtk_wed_device *dev) wdma_w32(dev, MTK_WDMA_RESET_IDX, MTK_WDMA_RESET_IDX_RX); wdma_w32(dev, MTK_WDMA_RESET_IDX, 0); + if (mtk_wed_get_rx_capa(dev)) + mtk_wdma_rx_reset(dev); + if (busy) { mtk_wed_reset(dev, MTK_WED_RESET_WDMA_INT_AGENT); mtk_wed_reset(dev, MTK_WED_RESET_WDMA_RX_DRV); @@ -668,12 +1039,11 @@ mtk_wed_reset_dma(struct mtk_wed_device *dev) MTK_WED_WPDMA_RESET_IDX_RX); wed_w32(dev, 
MTK_WED_WPDMA_RESET_IDX, 0); } - } static int mtk_wed_ring_alloc(struct mtk_wed_device *dev, struct mtk_wed_ring *ring, - int size, u32 desc_size) + int size, u32 desc_size, bool tx) { ring->desc = dma_alloc_coherent(dev->hw->dev, size * desc_size, &ring->desc_phys, GFP_KERNEL); @@ -682,7 +1052,7 @@ mtk_wed_ring_alloc(struct mtk_wed_device *dev, struct mtk_wed_ring *ring, ring->desc_size = desc_size; ring->size = size; - mtk_wed_ring_reset(ring, size); + mtk_wed_ring_reset(ring, size, tx); return 0; } @@ -691,9 +1061,14 @@ static int mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size) { u32 desc_size = sizeof(struct mtk_wdma_desc) * dev->hw->version; - struct mtk_wed_ring *wdma = &dev->rx_wdma[idx]; + struct mtk_wed_ring *wdma; - if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size)) + if (idx >= ARRAY_SIZE(dev->rx_wdma)) + return -EINVAL; + + wdma = &dev->rx_wdma[idx]; + if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size, + true)) return -ENOMEM; wdma_w32(dev, MTK_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_BASE, @@ -710,6 +1085,60 @@ mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size) return 0; }
+static int +mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size) +{ + u32 desc_size = sizeof(struct mtk_wdma_desc) * dev->hw->version; + struct mtk_wed_ring *wdma; + + if (idx >= ARRAY_SIZE(dev->tx_wdma)) + return -EINVAL; + + wdma = &dev->tx_wdma[idx]; + if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size, + true)) + return -ENOMEM; + + wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_BASE, + wdma->desc_phys); + wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_COUNT, + size); + wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_CPU_IDX, 0); + wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_DMA_IDX, 0); + + if (!idx) { + wed_w32(dev, MTK_WED_WDMA_RING_TX + MTK_WED_RING_OFS_BASE, + wdma->desc_phys); + wed_w32(dev, MTK_WED_WDMA_RING_TX + MTK_WED_RING_OFS_COUNT, + size); + wed_w32(dev, MTK_WED_WDMA_RING_TX + MTK_WED_RING_OFS_CPU_IDX, + 0); + wed_w32(dev, MTK_WED_WDMA_RING_TX + MTK_WED_RING_OFS_DMA_IDX, + 0); + } + + return 0; +} + +static void +mtk_wed_ppe_check(struct mtk_wed_device *dev, struct sk_buff *skb, + u32 reason, u32 hash) +{ + struct mtk_eth *eth = dev->hw->eth; + struct ethhdr *eh; + + if (!skb) + return; + + if (reason != MTK_PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) + return; + + skb_set_mac_header(skb, 0); + eh = eth_hdr(skb); + skb->protocol = eh->h_proto; + mtk_ppe_check_skb(eth->ppe[dev->hw->index], skb, hash); +} +
static void mtk_wed_configure_irq(struct mtk_wed_device *dev, u32 irq_mask) { @@ -732,6 +1161,8 @@ mtk_wed_configure_irq(struct mtk_wed_device *dev, u32 irq_mask) wed_clr(dev, MTK_WED_WDMA_INT_CTRL, wdma_mask); } else { + wdma_mask |= FIELD_PREP(MTK_WDMA_INT_MASK_TX_DONE, + GENMASK(1, 0)); /* initial tx interrupt trigger */ wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_TX, MTK_WED_WPDMA_INT_CTRL_TX0_DONE_EN | @@ -750,6 +1181,16 @@ mtk_wed_configure_irq(struct mtk_wed_device *dev, u32 irq_mask) FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_TX_FREE_DONE_TRIG, dev->wlan.txfree_tbit)); + wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_RX, + MTK_WED_WPDMA_INT_CTRL_RX0_EN | + MTK_WED_WPDMA_INT_CTRL_RX0_CLR | + MTK_WED_WPDMA_INT_CTRL_RX1_EN | + MTK_WED_WPDMA_INT_CTRL_RX1_CLR | + FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RX0_DONE_TRIG, + dev->wlan.rx_tbit[0]) | + FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RX1_DONE_TRIG, + dev->wlan.rx_tbit[1])); + + wed_w32(dev, MTK_WED_WDMA_INT_CLR, wdma_mask);
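/* stale WDMA interrupts are cleared above before the poll source below is configured */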
wed_set(dev, MTK_WED_WDMA_INT_CTRL, FIELD_PREP(MTK_WED_WDMA_INT_CTRL_POLL_SRC_SEL, @@ -787,9 +1228,15 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev) wdma_set(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_RX_INFO3_PRERES); } else { + int i; + wed_set(dev, MTK_WED_WPDMA_CTRL, MTK_WED_WPDMA_CTRL_SDL1_FIXED); + wed_set(dev, MTK_WED_WDMA_GLO_CFG, + MTK_WED_WDMA_GLO_CFG_TX_DRV_EN | + MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK); + wed_set(dev, MTK_WED_WPDMA_GLO_CFG, MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_PKT_PROC | MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_CRX_SYNC); @@ -797,6 +1244,15 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev) wed_clr(dev, MTK_WED_WPDMA_GLO_CFG, MTK_WED_WPDMA_GLO_CFG_TX_TKID_KEEP | MTK_WED_WPDMA_GLO_CFG_TX_DMAD_DW3_PREV); + + wed_set(dev, MTK_WED_WPDMA_RX_D_GLO_CFG, + MTK_WED_WPDMA_RX_D_RX_DRV_EN | + FIELD_PREP(MTK_WED_WPDMA_RX_D_RXD_READ_LEN, 0x18) | + FIELD_PREP(MTK_WED_WPDMA_RX_D_INIT_PHASE_RXEN_SEL, + 0x2)); + + for (i = 0; i < MTK_WED_RX_QUEUES; i++) + mtk_wed_check_wfdma_rx_fill(dev, i); } } @@ -822,7 +1278,19 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask) val |= BIT(0) | (BIT(1) * !!dev->hw->index); regmap_write(dev->hw->mirror, dev->hw->index * 4, val); } else { - mtk_wed_set_512_support(dev, true); + /* driver set mid ready and only once */ + wed_w32(dev, MTK_WED_EXT_INT_MASK1, + MTK_WED_EXT_INT_STATUS_WPDMA_MID_RDY); + wed_w32(dev, MTK_WED_EXT_INT_MASK2, + MTK_WED_EXT_INT_STATUS_WPDMA_MID_RDY); + + wed_r32(dev, MTK_WED_EXT_INT_MASK1); + wed_r32(dev, MTK_WED_EXT_INT_MASK2); + + if (mtk_wed_rro_cfg(dev)) + return; + + mtk_wed_set_512_support(dev, dev->wlan.wcid_512); } mtk_wed_dma_enable(dev); @@ -856,7 +1324,7 @@ mtk_wed_attach(struct mtk_wed_device *dev) if (!hw) { module_put(THIS_MODULE); ret = -ENODEV; - goto out; + goto unlock; } device = dev->wlan.bus_type == MTK_WED_BUS_PCIE @@ -869,15 +1337,24 @@ mtk_wed_attach(struct mtk_wed_device *dev) dev->dev = hw->dev; dev->irq = hw->irq; dev->wdma_idx = hw->index; + dev->version = hw->version; if (hw->eth->dma_dev == hw->eth->dev && of_dma_is_coherent(hw->eth->dev->of_node)) mtk_eth_set_dma_device(hw->eth, hw->dev); - ret = mtk_wed_buffer_alloc(dev); - if (ret) { - mtk_wed_detach(dev); + ret = mtk_wed_tx_buffer_alloc(dev); + if (ret) goto out; + + if (mtk_wed_get_rx_capa(dev)) { + ret = mtk_wed_rx_buffer_alloc(dev); + if (ret) + goto out; + + ret = mtk_wed_rro_alloc(dev); + if (ret) + goto out; } mtk_wed_hw_init_early(dev); @@ -886,8 +1363,10 @@ mtk_wed_attach(struct mtk_wed_device *dev) BIT(hw->index), 0); else ret = mtk_wed_wo_init(hw); - out: + if (ret) + mtk_wed_detach(dev); +unlock: mutex_unlock(&hw_lock); return ret; @@ -910,10 +1389,11 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) * WDMA RX. 
*/ - BUG_ON(idx >= ARRAY_SIZE(dev->tx_ring)); + if (WARN_ON(idx >= ARRAY_SIZE(dev->tx_ring))) + return -EINVAL; if (mtk_wed_ring_alloc(dev, ring, MTK_WED_TX_RING_SIZE, - sizeof(*ring->desc))) + sizeof(*ring->desc), true)) return -ENOMEM; if (mtk_wed_wdma_rx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE)) @@ -960,6 +1440,37 @@ mtk_wed_txfree_ring_setup(struct mtk_wed_device *dev, void __iomem *regs) return 0; } +static int +mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) +{ + struct mtk_wed_ring *ring = &dev->rx_ring[idx]; + + if (WARN_ON(idx >= ARRAY_SIZE(dev->rx_ring))) + return -EINVAL; + + if (mtk_wed_ring_alloc(dev, ring, MTK_WED_RX_RING_SIZE, + sizeof(*ring->desc), false)) + return -ENOMEM; + + if (mtk_wed_wdma_tx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE)) + return -ENOMEM; + + ring->reg_base = MTK_WED_RING_RX_DATA(idx); + ring->wpdma = regs; + ring->flags |= MTK_WED_RING_CONFIGURED; + + /* WPDMA -> WED */ + wpdma_rx_w32(dev, idx, MTK_WED_RING_OFS_BASE, ring->desc_phys); + wpdma_rx_w32(dev, idx, MTK_WED_RING_OFS_COUNT, MTK_WED_RX_RING_SIZE); + + wed_w32(dev, MTK_WED_WPDMA_RING_RX_DATA(idx) + MTK_WED_RING_OFS_BASE, + ring->desc_phys); + wed_w32(dev, MTK_WED_WPDMA_RING_RX_DATA(idx) + MTK_WED_RING_OFS_COUNT, + MTK_WED_RX_RING_SIZE); + + return 0; +} + static u32 mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask) { @@ -1056,7 +1567,9 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth, static const struct mtk_wed_ops wed_ops = { .attach = mtk_wed_attach, .tx_ring_setup = mtk_wed_tx_ring_setup, + .rx_ring_setup = mtk_wed_rx_ring_setup, .txfree_ring_setup = mtk_wed_txfree_ring_setup, + .msg_update = mtk_wed_mcu_msg_update, .start = mtk_wed_start, .stop = mtk_wed_stop, .reset_dma = mtk_wed_reset_dma, @@ -1065,6 +1578,7 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth, .irq_get = mtk_wed_irq_get, .irq_set_mask = mtk_wed_irq_set_mask, .detach = mtk_wed_detach, + .ppe_check = mtk_wed_ppe_check, }; struct device_node *eth_np = eth->dev->of_node; struct platform_device *pdev; diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h index af656fd31ff9..e012b8a82133 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.h +++ b/drivers/net/ethernet/mediatek/mtk_wed.h @@ -86,6 +86,24 @@ wpdma_tx_w32(struct mtk_wed_device *dev, int ring, u32 reg, u32 val) writel(val, dev->tx_ring[ring].wpdma + reg); } +static inline u32 +wpdma_rx_r32(struct mtk_wed_device *dev, int ring, u32 reg) +{ + if (!dev->rx_ring[ring].wpdma) + return 0; + + return readl(dev->rx_ring[ring].wpdma + reg); +} + +static inline void +wpdma_rx_w32(struct mtk_wed_device *dev, int ring, u32 reg, u32 val) +{ + if (!dev->rx_ring[ring].wpdma) + return; + + writel(val, dev->rx_ring[ring].wpdma + reg); +} + static inline u32 wpdma_txfree_r32(struct mtk_wed_device *dev, u32 reg) { @@ -128,6 +146,7 @@ static inline int mtk_wed_flow_add(int index) static inline void mtk_wed_flow_remove(int index) { } + #endif #ifdef CONFIG_DEBUG_FS diff --git a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c index b657328a1d9d..f9539e6233c9 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c +++ b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "mtk_wed_regs.h" #include "mtk_wed_wo.h" @@ -60,24 +61,37 @@ void mtk_wed_mcu_rx_event(struct mtk_wed_wo *wo, struct sk_buff *skb) wake_up(&wo->mcu.wait); } +static void +mtk_wed_update_rx_stats(struct 
mtk_wed_device *wed, struct sk_buff *skb) +{ + u32 count = get_unaligned_le32(skb->data); + struct mtk_wed_wo_rx_stats *stats; + int i; + + if (count * sizeof(*stats) > skb->len - sizeof(u32)) + return; + + stats = (struct mtk_wed_wo_rx_stats *)(skb->data + sizeof(u32)); + for (i = 0 ; i < count ; i++) + wed->wlan.update_wo_rx_stats(wed, &stats[i]); +} + void mtk_wed_mcu_rx_unsolicited_event(struct mtk_wed_wo *wo, struct sk_buff *skb) { struct mtk_wed_mcu_hdr *hdr = (struct mtk_wed_mcu_hdr *)skb->data; - switch (hdr->cmd) { - case MTK_WED_WO_EVT_LOG_DUMP: { - const char *msg = (const char *)(skb->data + sizeof(*hdr)); + skb_pull(skb, sizeof(*hdr)); - dev_notice(wo->hw->dev, "%s\n", msg); + switch (hdr->cmd) { + case MTK_WED_WO_EVT_LOG_DUMP: + dev_notice(wo->hw->dev, "%s\n", skb->data); break; - } case MTK_WED_WO_EVT_PROFILING: { - struct mtk_wed_wo_log_info *info; - u32 count = (skb->len - sizeof(*hdr)) / sizeof(*info); + struct mtk_wed_wo_log_info *info = (void *)skb->data; + u32 count = skb->len / sizeof(*info); int i; - info = (struct mtk_wed_wo_log_info *)(skb->data + sizeof(*hdr)); for (i = 0 ; i < count ; i++) dev_notice(wo->hw->dev, "SN:%u latency: total=%u, rro:%u, mod:%u\n", @@ -88,6 +102,7 @@ void mtk_wed_mcu_rx_unsolicited_event(struct mtk_wed_wo *wo, break; } case MTK_WED_WO_EVT_RXCNT_INFO: + mtk_wed_update_rx_stats(wo->hw->wed_dev, skb); break; default: break; @@ -144,6 +159,8 @@ mtk_wed_mcu_parse_response(struct mtk_wed_wo *wo, struct sk_buff *skb, skb_pull(skb, sizeof(*hdr)); switch (cmd) { case MTK_WED_WO_CMD_RXCNT_INFO: + mtk_wed_update_rx_stats(wo->hw->wed_dev, skb); + break; default: break; } @@ -182,6 +199,18 @@ unlock: return ret; } +int mtk_wed_mcu_msg_update(struct mtk_wed_device *dev, int id, void *data, + int len) +{ + struct mtk_wed_wo *wo = dev->hw->wed_wo; + + if (dev->hw->version == 1) + return 0; + + return mtk_wed_mcu_send_msg(wo, MTK_WED_MODULE_ID_WO, id, data, len, + true); +} + static int mtk_wed_get_memory_region(struct mtk_wed_wo *wo, struct mtk_wed_wo_memory_region *region) diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h index c940b3bb215b..9e39dace95eb 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h +++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h @@ -4,6 +4,7 @@ #ifndef __MTK_WED_REGS_H #define __MTK_WED_REGS_H +#define MTK_WFDMA_DESC_CTRL_TO_HOST BIT(8) #define MTK_WDMA_DESC_CTRL_LEN1 GENMASK(14, 0) #define MTK_WDMA_DESC_CTRL_LEN1_V2 GENMASK(13, 0) #define MTK_WDMA_DESC_CTRL_LAST_SEG1 BIT(15) @@ -28,6 +29,8 @@ struct mtk_wdma_desc { #define MTK_WED_RESET_WED_TX_DMA BIT(12) #define MTK_WED_RESET_WDMA_RX_DRV BIT(17) #define MTK_WED_RESET_WDMA_INT_AGENT BIT(19) +#define MTK_WED_RESET_RX_RRO_QM BIT(20) +#define MTK_WED_RESET_RX_ROUTE_QM BIT(21) #define MTK_WED_RESET_WED BIT(31) #define MTK_WED_CTRL 0x00c @@ -39,8 +42,12 @@ struct mtk_wdma_desc { #define MTK_WED_CTRL_WED_TX_BM_BUSY BIT(9) #define MTK_WED_CTRL_WED_TX_FREE_AGENT_EN BIT(10) #define MTK_WED_CTRL_WED_TX_FREE_AGENT_BUSY BIT(11) -#define MTK_WED_CTRL_RESERVE_EN BIT(12) -#define MTK_WED_CTRL_RESERVE_BUSY BIT(13) +#define MTK_WED_CTRL_WED_RX_BM_EN BIT(12) +#define MTK_WED_CTRL_WED_RX_BM_BUSY BIT(13) +#define MTK_WED_CTRL_RX_RRO_QM_EN BIT(14) +#define MTK_WED_CTRL_RX_RRO_QM_BUSY BIT(15) +#define MTK_WED_CTRL_RX_ROUTE_QM_EN BIT(16) +#define MTK_WED_CTRL_RX_ROUTE_QM_BUSY BIT(17) #define MTK_WED_CTRL_FINAL_DIDX_READ BIT(24) #define MTK_WED_CTRL_ETH_DMAD_FMT BIT(25) #define MTK_WED_CTRL_MIB_READ_CLEAR BIT(28) @@ -62,6 +69,9 @@ struct 
mtk_wdma_desc { #define MTK_WED_EXT_INT_STATUS_TX_DMA_R_RESP_ERR BIT(22) #define MTK_WED_EXT_INT_STATUS_TX_DMA_W_RESP_ERR BIT(23) #define MTK_WED_EXT_INT_STATUS_RX_DRV_DMA_RECYCLE BIT(24) +#define MTK_WED_EXT_INT_STATUS_RX_DRV_GET_BM_DMAD_SKIP BIT(25) +#define MTK_WED_EXT_INT_STATUS_WPDMA_RX_D_DRV_ERR BIT(26) +#define MTK_WED_EXT_INT_STATUS_WPDMA_MID_RDY BIT(27) #define MTK_WED_EXT_INT_STATUS_ERROR_MASK (MTK_WED_EXT_INT_STATUS_TF_LEN_ERR | \ MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD | \ MTK_WED_EXT_INT_STATUS_TKID_TITO_INVALID | \ @@ -71,6 +81,8 @@ struct mtk_wdma_desc { MTK_WED_EXT_INT_STATUS_TX_DMA_R_RESP_ERR) #define MTK_WED_EXT_INT_MASK 0x028 +#define MTK_WED_EXT_INT_MASK1 0x02c +#define MTK_WED_EXT_INT_MASK2 0x030 #define MTK_WED_STATUS 0x060 #define MTK_WED_STATUS_TX GENMASK(15, 8) @@ -151,6 +163,7 @@ struct mtk_wdma_desc { #define MTK_WED_RING_TX(_n) (0x300 + (_n) * 0x10) #define MTK_WED_RING_RX(_n) (0x400 + (_n) * 0x10) +#define MTK_WED_RING_RX_DATA(_n) (0x420 + (_n) * 0x10) #define MTK_WED_SCR0 0x3c0 #define MTK_WED_WPDMA_INT_TRIGGER 0x504 @@ -213,6 +226,12 @@ struct mtk_wdma_desc { #define MTK_WED_WPDMA_INT_CTRL_TX1_DONE_TRIG GENMASK(14, 10) #define MTK_WED_WPDMA_INT_CTRL_RX 0x534 +#define MTK_WED_WPDMA_INT_CTRL_RX0_EN BIT(0) +#define MTK_WED_WPDMA_INT_CTRL_RX0_CLR BIT(1) +#define MTK_WED_WPDMA_INT_CTRL_RX0_DONE_TRIG GENMASK(6, 2) +#define MTK_WED_WPDMA_INT_CTRL_RX1_EN BIT(8) +#define MTK_WED_WPDMA_INT_CTRL_RX1_CLR BIT(9) +#define MTK_WED_WPDMA_INT_CTRL_RX1_DONE_TRIG GENMASK(14, 10) #define MTK_WED_WPDMA_INT_CTRL_TX_FREE 0x538 #define MTK_WED_WPDMA_INT_CTRL_TX_FREE_DONE_EN BIT(0) @@ -242,11 +261,34 @@ struct mtk_wdma_desc { #define MTK_WED_WPDMA_RING_TX(_n) (0x600 + (_n) * 0x10) #define MTK_WED_WPDMA_RING_RX(_n) (0x700 + (_n) * 0x10) +#define MTK_WED_WPDMA_RING_RX_DATA(_n) (0x730 + (_n) * 0x10) + +#define MTK_WED_WPDMA_RX_D_GLO_CFG 0x75c +#define MTK_WED_WPDMA_RX_D_RX_DRV_EN BIT(0) +#define MTK_WED_WPDMA_RX_D_INIT_PHASE_RXEN_SEL GENMASK(11, 7) +#define MTK_WED_WPDMA_RX_D_RXD_READ_LEN GENMASK(31, 24) + +#define MTK_WED_WPDMA_RX_D_RST_IDX 0x760 +#define MTK_WED_WPDMA_RX_D_RST_CRX_IDX GENMASK(17, 16) +#define MTK_WED_WPDMA_RX_D_RST_DRV_IDX GENMASK(25, 24) + +#define MTK_WED_WPDMA_RX_GLO_CFG 0x76c +#define MTK_WED_WPDMA_RX_RING 0x770 + +#define MTK_WED_WPDMA_RX_D_MIB(_n) (0x774 + (_n) * 4) +#define MTK_WED_WPDMA_RX_D_PROCESSED_MIB(_n) (0x784 + (_n) * 4) +#define MTK_WED_WPDMA_RX_D_COHERENT_MIB 0x78c + +#define MTK_WED_WDMA_RING_TX 0x800 + +#define MTK_WED_WDMA_TX_MIB 0x810 + #define MTK_WED_WDMA_RING_RX(_n) (0x900 + (_n) * 0x10) #define MTK_WED_WDMA_RX_THRES(_n) (0x940 + (_n) * 0x4) #define MTK_WED_WDMA_GLO_CFG 0xa04 #define MTK_WED_WDMA_GLO_CFG_TX_DRV_EN BIT(0) +#define MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK BIT(1) #define MTK_WED_WDMA_GLO_CFG_RX_DRV_EN BIT(2) #define MTK_WED_WDMA_GLO_CFG_RX_DRV_BUSY BIT(3) #define MTK_WED_WDMA_GLO_CFG_BT_SIZE GENMASK(5, 4) @@ -291,6 +333,20 @@ struct mtk_wdma_desc { #define MTK_WED_WDMA_RX_RECYCLE_MIB(_n) (0xae8 + (_n) * 4) #define MTK_WED_WDMA_RX_PROCESSED_MIB(_n) (0xaf0 + (_n) * 4) +#define MTK_WED_RX_BM_RX_DMAD 0xd80 +#define MTK_WED_RX_BM_RX_DMAD_SDL0 GENMASK(13, 0) + +#define MTK_WED_RX_BM_BASE 0xd84 +#define MTK_WED_RX_BM_INIT_PTR 0xd88 +#define MTK_WED_RX_BM_SW_TAIL GENMASK(15, 0) +#define MTK_WED_RX_BM_INIT_SW_TAIL BIT(16) + +#define MTK_WED_RX_PTR 0xd8c + +#define MTK_WED_RX_BM_DYN_ALLOC_TH 0xdb4 +#define MTK_WED_RX_BM_DYN_ALLOC_TH_H GENMASK(31, 16) +#define MTK_WED_RX_BM_DYN_ALLOC_TH_L GENMASK(15, 0) + #define MTK_WED_RING_OFS_BASE 0x00 #define 
MTK_WED_RING_OFS_COUNT 0x04 #define MTK_WED_RING_OFS_CPU_IDX 0x08 @@ -301,7 +357,9 @@ struct mtk_wdma_desc { #define MTK_WDMA_GLO_CFG 0x204 #define MTK_WDMA_GLO_CFG_TX_DMA_EN BIT(0) +#define MTK_WDMA_GLO_CFG_TX_DMA_BUSY BIT(1) #define MTK_WDMA_GLO_CFG_RX_DMA_EN BIT(2) +#define MTK_WDMA_GLO_CFG_RX_DMA_BUSY BIT(3) #define MTK_WDMA_GLO_CFG_RX_INFO3_PRERES BIT(26) #define MTK_WDMA_GLO_CFG_RX_INFO2_PRERES BIT(27) #define MTK_WDMA_GLO_CFG_RX_INFO1_PRERES BIT(28) @@ -330,4 +388,70 @@ struct mtk_wdma_desc { /* DMA channel mapping */ #define HIFSYS_DMA_AG_MAP 0x008 +#define MTK_WED_RTQM_GLO_CFG 0xb00 +#define MTK_WED_RTQM_BUSY BIT(1) +#define MTK_WED_RTQM_Q_RST BIT(2) +#define MTK_WED_RTQM_Q_DBG_BYPASS BIT(5) +#define MTK_WED_RTQM_TXDMAD_FPORT GENMASK(23, 20) + +#define MTK_WED_RTQM_R2H_MIB(_n) (0xb70 + (_n) * 0x4) +#define MTK_WED_RTQM_R2Q_MIB(_n) (0xb78 + (_n) * 0x4) +#define MTK_WED_RTQM_Q2N_MIB 0xb80 +#define MTK_WED_RTQM_Q2H_MIB(_n) (0xb84 + (_n) * 0x4) + +#define MTK_WED_RTQM_Q2B_MIB 0xb8c +#define MTK_WED_RTQM_PFDBK_MIB 0xb90 + +#define MTK_WED_RROQM_GLO_CFG 0xc04 +#define MTK_WED_RROQM_RST_IDX 0xc08 +#define MTK_WED_RROQM_RST_IDX_MIOD BIT(0) +#define MTK_WED_RROQM_RST_IDX_FDBK BIT(4) + +#define MTK_WED_RROQM_MIOD_CTRL0 0xc40 +#define MTK_WED_RROQM_MIOD_CTRL1 0xc44 +#define MTK_WED_RROQM_MIOD_CNT GENMASK(11, 0) + +#define MTK_WED_RROQM_MIOD_CTRL2 0xc48 +#define MTK_WED_RROQM_MIOD_CTRL3 0xc4c + +#define MTK_WED_RROQM_FDBK_CTRL0 0xc50 +#define MTK_WED_RROQM_FDBK_CTRL1 0xc54 +#define MTK_WED_RROQM_FDBK_CNT GENMASK(11, 0) + +#define MTK_WED_RROQM_FDBK_CTRL2 0xc58 + +#define MTK_WED_RROQ_BASE_L 0xc80 +#define MTK_WED_RROQ_BASE_H 0xc84 + +#define MTK_WED_RROQM_MIOD_CFG 0xc8c +#define MTK_WED_RROQM_MIOD_MID_DW GENMASK(5, 0) +#define MTK_WED_RROQM_MIOD_MOD_DW GENMASK(13, 8) +#define MTK_WED_RROQM_MIOD_ENTRY_DW GENMASK(22, 16) + +#define MTK_WED_RROQM_MID_MIB 0xcc0 +#define MTK_WED_RROQM_MOD_MIB 0xcc4 +#define MTK_WED_RROQM_MOD_COHERENT_MIB 0xcc8 +#define MTK_WED_RROQM_FDBK_MIB 0xcd0 +#define MTK_WED_RROQM_FDBK_COHERENT_MIB 0xcd4 +#define MTK_WED_RROQM_FDBK_IND_MIB 0xce0 +#define MTK_WED_RROQM_FDBK_ENQ_MIB 0xce4 +#define MTK_WED_RROQM_FDBK_ANC_MIB 0xce8 +#define MTK_WED_RROQM_FDBK_ANC2H_MIB 0xcec + +#define MTK_WED_RX_BM_RX_DMAD 0xd80 +#define MTK_WED_RX_BM_BASE 0xd84 +#define MTK_WED_RX_BM_INIT_PTR 0xd88 +#define MTK_WED_RX_BM_PTR 0xd8c +#define MTK_WED_RX_BM_PTR_HEAD GENMASK(32, 16) +#define MTK_WED_RX_BM_PTR_TAIL GENMASK(15, 0) + +#define MTK_WED_RX_BM_BLEN 0xd90 +#define MTK_WED_RX_BM_STS 0xd94 +#define MTK_WED_RX_BM_INTF2 0xd98 +#define MTK_WED_RX_BM_INTF 0xd9c +#define MTK_WED_RX_BM_ERR_STS 0xda8 + +#define MTK_WED_WOCPU_VIEW_MIOD_BASE 0x8000 +#define MTK_WED_PCIE_INT_MASK 0x0 + #endif diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.h b/drivers/net/ethernet/mediatek/mtk_wed_wo.h index ed7e69d420dc..c8fb85795864 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_wo.h +++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.h @@ -49,6 +49,10 @@ enum { MTK_WED_WARP_CMD_FLAG_FROM_TO_WO = BIT(2), }; +#define MTK_WED_WO_CPU_MCUSYS_RESET_ADDR 0x15194050 +#define MTK_WED_WO_CPU_WO0_MCUSYS_RESET_MASK 0x20 +#define MTK_WED_WO_CPU_WO1_MCUSYS_RESET_MASK 0x1 + enum { MTK_WED_WO_REGION_EMI, MTK_WED_WO_REGION_ILM, @@ -57,6 +61,28 @@ enum { __MTK_WED_WO_REGION_MAX, }; +enum mtk_wed_wo_state { + MTK_WED_WO_STATE_UNDEFINED, + MTK_WED_WO_STATE_INIT, + MTK_WED_WO_STATE_ENABLE, + MTK_WED_WO_STATE_DISABLE, + MTK_WED_WO_STATE_HALT, + MTK_WED_WO_STATE_GATING, + MTK_WED_WO_STATE_SER_RESET, + MTK_WED_WO_STATE_WF_RESET, +}; 
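+ +/* WO firmware handshake values reported through the WO status register; mtk_wed_wo_reset() polls for MTK_WED_WOIF_DISABLE_DONE */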
+ +enum mtk_wed_wo_done_state { + MTK_WED_WOIF_UNDEFINED, + MTK_WED_WOIF_DISABLE_DONE, + MTK_WED_WOIF_TRIGGER_ENABLE, + MTK_WED_WOIF_ENABLE_DONE, + MTK_WED_WOIF_TRIGGER_GATING, + MTK_WED_WOIF_GATING_DONE, + MTK_WED_WOIF_TRIGGER_HALT, + MTK_WED_WOIF_HALT_DONE, +}; + enum mtk_wed_dummy_cr_idx { MTK_WED_DUMMY_CR_FWDL, MTK_WED_DUMMY_CR_WO_STATUS, @@ -245,6 +271,8 @@ void mtk_wed_mcu_rx_unsolicited_event(struct mtk_wed_wo *wo, struct sk_buff *skb); int mtk_wed_mcu_send_msg(struct mtk_wed_wo *wo, int id, int cmd, const void *data, int len, bool wait_resp); +int mtk_wed_mcu_msg_update(struct mtk_wed_device *dev, int id, void *data, + int len); int mtk_wed_mcu_init(struct mtk_wed_wo *wo); int mtk_wed_wo_init(struct mtk_wed_hw *hw); void mtk_wed_wo_deinit(struct mtk_wed_hw *hw); diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 956978320f8b..8294978f4bca 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -5,10 +5,13 @@ #include #include #include +#include #define MTK_WED_TX_QUEUES 2 #define MTK_WED_RX_QUEUES 2 +#define WED_WO_STA_REC 0x6 + struct mtk_wed_hw; struct mtk_wdma_desc; @@ -41,21 +44,37 @@ enum mtk_wed_wo_cmd { MTK_WED_WO_CMD_WED_END }; +struct mtk_rxbm_desc { + __le32 buf0; + __le32 token; +} __packed __aligned(4); + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, }; +#define MTK_WED_RING_CONFIGURED BIT(0) struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; u32 desc_size; int size; + u32 flags; u32 reg_base; void __iomem *wpdma; }; +struct mtk_wed_wo_rx_stats { + __le16 wlan_idx; + __le16 tid; + __le32 rx_pkt_cnt; + __le32 rx_byte_cnt; + __le32 rx_err_cnt; + __le32 rx_drop_cnt; +}; + struct mtk_wed_device { #ifdef CONFIG_NET_MEDIATEK_SOC_WED const struct mtk_wed_ops *ops; @@ -64,9 +83,12 @@ struct mtk_wed_device { bool init_done, running; int wdma_idx; int irq; + u8 version; struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; + struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { @@ -74,7 +96,20 @@ struct mtk_wed_device { void **pages; struct mtk_wdma_desc *desc; dma_addr_t desc_phys; - } buf_ring; + } tx_buf_ring; + + struct { + int size; + struct page_frag_cache rx_page; + struct mtk_rxbm_desc *desc; + dma_addr_t desc_phys; + } rx_buf_ring; + + struct { + struct mtk_wed_ring ring; + dma_addr_t miod_phys; + dma_addr_t fdbk_phys; + } rro; /* filled by driver: */ struct { @@ -83,22 +118,36 @@ struct mtk_wed_device { struct pci_dev *pci_dev; }; enum mtk_wed_bus_tye bus_type; + void __iomem *base; + u32 phy_base; u32 wpdma_phys; u32 wpdma_int; u32 wpdma_mask; u32 wpdma_tx; u32 wpdma_txfree; + u32 wpdma_rx_glo; + u32 wpdma_rx; + + bool wcid_512; u16 token_start; unsigned int nbuf; + unsigned int rx_nbuf; + unsigned int rx_npkt; + unsigned int rx_size; u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 rx_tbit[MTK_WED_RX_QUEUES]; u8 txfree_tbit; u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); + u32 (*init_rx_buf)(struct mtk_wed_device *wed, int size); + void (*release_rx_buf)(struct mtk_wed_device *wed); + void (*update_wo_rx_stats)(struct mtk_wed_device *wed, + struct mtk_wed_wo_rx_stats *stats); } wlan; #endif }; @@ -107,9 +156,15 @@ struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, 
int ring, void __iomem *regs); + int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, + void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); + int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, + void *data, int len); void (*detach)(struct mtk_wed_device *dev); + void (*ppe_check)(struct mtk_wed_device *dev, struct sk_buff *skb, + u32 reason, u32 hash); void (*stop)(struct mtk_wed_device *dev); void (*start)(struct mtk_wed_device *dev, u32 irq_mask); @@ -144,6 +199,16 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) return ret; } +static inline bool +mtk_wed_get_rx_capa(struct mtk_wed_device *dev) +{ +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + return dev->version != 1; +#else + return false; +#endif +} + #ifdef CONFIG_NET_MEDIATEK_SOC_WED #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) @@ -160,6 +225,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ + (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ + (_dev)->ops->msg_update(_dev, _id, _msg, _len) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -173,6 +244,9 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #endif #endif -- cgit v1.2.3 From 75ab70ec5cefade915a999886e4863fe6a38f8b5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 9 Nov 2022 15:09:45 -0800 Subject: ptp: remove the .adjfreq interface function Now that all drivers have been converted to .adjfine, we can remove the .adjfreq from the interface structure. Signed-off-by: Jacob Keller Cc: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 5 +---- include/linux/ptp_clock_kernel.h | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 51cae72bb6db..62d4d29e7c05 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -131,10 +131,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx) long ppb = scaled_ppm_to_ppb(tx->freq); if (ppb > ops->max_adj || ppb < -ops->max_adj) return -ERANGE; - if (ops->adjfine) - err = ops->adjfine(ops, tx->freq); - else - err = ops->adjfreq(ops, ppb); + err = ops->adjfine(ops, tx->freq); ptp->dialed_frequency = tx->freq; } else if (tx->modes & ADJ_OFFSET) { if (ops->adjphase) { diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index f4781c5766d6..fdffa6a98d79 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -77,12 +77,6 @@ struct ptp_system_timestamp { * nominal frequency in parts per million, but with a * 16 bit binary fractional field. * - * @adjfreq: Adjusts the frequency of the hardware clock. 
- * This method is deprecated. New drivers should implement - * the @adjfine method instead. - * parameter delta: Desired frequency offset from nominal frequency - * in parts per billion - * * @adjphase: Adjusts the phase offset of the hardware clock. * parameter delta: Desired change in nanoseconds. * @@ -174,7 +168,6 @@ struct ptp_clock_info { int pps; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); - int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); -- cgit v1.2.3
From 354259fa73e2aac92ae5e19522adb69a92c15b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Nov 2022 09:57:58 +0000 Subject: net: remove skb->vlan_present skb->vlan_present seems redundant. We can instead derive it from this boolean expression: vlan_present = skb->vlan_proto != 0 || skb->vlan_tci != 0 Add a new union to access both fields in a single load/store when possible. union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; This allows the following patch to remove a conditional test in the GRO stack. Note: We move remcsum_offload to keep TC_AT_INGRESS_MASK and SKB_MONO_DELIVERY_TIME_MASK unchanged. Signed-off-by: Eric Dumazet Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- arch/sparc/net/bpf_jit_comp_32.c | 10 +++++----- .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- include/linux/if_vlan.h | 9 +++------ include/linux/skbuff.h | 18 ++++++++++-------- lib/test_bpf.c | 1 - net/core/filter.c | 22 ++++++++++------------ 6 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux')
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index b1dbf2fa8c0a..a74e5004c6c8 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -555,11 +555,11 @@ void bpf_jit_compile(struct bpf_prog *fp) emit_skb_load16(vlan_tci, r_A); break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: - __emit_skb_load8(__pkt_vlan_present_offset, r_A); - if (PKT_VLAN_PRESENT_BIT) - emit_alu_K(SRL, PKT_VLAN_PRESENT_BIT); - if (PKT_VLAN_PRESENT_BIT < 7) - emit_andi(r_A, 1, r_A); + emit_skb_load32(vlan_all, r_A); + emit_cmpi(r_A, 0); + emit_branch_off(BE, 12); + emit_nop(); + emit_loadimm(1, r_A); break; case BPF_LD | BPF_W | BPF_LEN: emit_skb_load32(len, r_A);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 303930499a4c..c1ea60bc2630 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1973,7 +1973,7 @@ static u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb, #endif #ifdef CONFIG_DCB - if (!skb->vlan_present) + if (!skb_vlan_tag_present(skb)) goto pick_tx; vlan_prio = skb->vlan_tci >> 13;
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e00c4ee81ff7..6864b89ef868 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -76,7 +76,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define
skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) @@ -471,7 +471,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_present = 0; + skb->vlan_all = 0; } /** @@ -483,9 +483,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { - dst->vlan_present = src->vlan_present; - dst->vlan_proto = src->vlan_proto; - dst->vlan_tci = src->vlan_tci; + dst->vlan_all = src->vlan_all; } /* @@ -519,7 +517,6 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; - skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59c9fd55699d..4e464a27adaf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -818,7 +818,7 @@ typedef unsigned char *sk_buff_data_t; * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff - * @vlan_present: VLAN tag is present + * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) @@ -951,7 +951,7 @@ struct sk_buff { /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ + __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; @@ -966,7 +966,6 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; - __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; @@ -999,8 +998,13 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; - __be16 vlan_proto; - __u16 vlan_tci; + union { + u32 vlan_all; + struct { + __be16 vlan_proto; + __u16 vlan_tci; + }; + }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; @@ -1059,15 +1063,13 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time +/* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. 
*/ #ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else -#define PKT_VLAN_PRESENT_BIT 0 #define TC_AT_INGRESS_MASK (1 << 7) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif
diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 5820704165a6..ade9ac672adb 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14346,7 +14346,6 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; - skb->vlan_present = SKB_VLAN_PRESENT; skb->vlan_proto = htons(ETH_P_IP); dev_net_set(&dev, &init_net); skb->dev = &dev;
diff --git a/net/core/filter.c b/net/core/filter.c index bb0136e7a8e4..358d5e70671a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -325,11 +325,11 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: - *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET); - if (PKT_VLAN_PRESENT_BIT) - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); - if (PKT_VLAN_PRESENT_BIT < 7) - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_all)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } @@ -9290,13 +9290,11 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - *target_size = 1; - *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, - PKT_VLAN_PRESENT_OFFSET); - if (PKT_VLAN_PRESENT_BIT) - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); - if (PKT_VLAN_PRESENT_BIT < 7) - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + vlan_all, 4, target_size)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): -- cgit v1.2.3
From 2c925db0a7d69b404d6bfe4c037935c2d367913d Mon Sep 17 00:00:00 2001 From: Ofer Levi Date: Tue, 9 Feb 2021 17:48:11 +0200 Subject: net/mlx5e: Support enhanced CQE compression The CQE compression feature improves performance by reducing the PCI bandwidth bottleneck on CQE writes. Enhanced CQE compression, introduced in ConnectX-6, aims to reduce the CPU utilization of SW-side packet decompression: the ownership-bit rewrite, which is likely to cost a cache miss, is replaced by a validity byte handled solely by HW. Another advantage of the enhanced feature is that session packets are available to SW as soon as a single CQE slot is filled, instead of waiting for the session to close; this improves packet latency from NIC to host. Performance: Following are the tested scenarios and results comparing basic and enhanced CQE compression. Setup: IXIA 100GbE connected directly to port 0 and port 1 of ConnectX-6 Dx 100GbE dual port. Case #1 RX only, single flow goes to single queue: IRQ rate reduced by ~30%, CPU utilization improved by 2%. Case #2 IP forwarding from port 1 to port 0, single flow goes to single queue: Avg latency improved from 60us to 21us, frame loss improved from 0.5% to 0.0%. Case #3 IP forwarding from port 1 to port 0, Max Throughput, IXIA sends 100%, 8192 UDP flows, goes to 24 queues: Enhanced is equal to or slightly better than basic. Testing the basic compression feature with this patch shows no performance degradation. Signed-off-by: Ofer Levi Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed
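The core of the scheme: instead of testing an ownership bit that SW must later rewrite, SW only compares the CQE's validity byte against the low byte of the CQ's SW wrap counter. A minimal sketch of that polling step, condensed from the wq.h hunk below (the helper name is shortened here for illustration; the patch adds it as mlx5_cqwq_get_cqe_enahnced_comp):

static inline struct mlx5_cqe64 *mlx5_cqwq_poll_enhanced(struct mlx5_cqwq *wq)
{
	/* expected validity value: low byte of the SW wrap counter */
	u8 sw_validity_iteration_count = mlx5_cqwq_get_wrap_cnt(wq) & 0xff;
	struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, mlx5_cqwq_get_ci(wq));

	/* HW has not filled this slot in the current iteration yet */
	if (cqe->validity_iteration_count != sw_validity_iteration_count)
		return NULL;

	/* read the CQE payload only after the validity byte is confirmed */
	dma_rmb();
	return cqe;
}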
--- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 + .../net/ethernet/mellanox/mlx5/core/en/params.c | 10 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 150 ++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/wq.h | 17 +++ include/linux/mlx5/device.h | 6 + 6 files changed, 170 insertions(+), 23 deletions(-) (limited to 'include/linux')
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 26a23047f1f3..ff5b302531d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -344,6 +344,7 @@ enum { MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */ MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, /* set when mini_cqe_resp_stride_index cap is used */ MLX5E_RQ_STATE_SHAMPO, /* set when SHAMPO cap is used */ + MLX5E_RQ_STATE_MINI_CQE_ENHANCED, /* set when enhanced mini_cqe_cap is used */ }; struct mlx5e_cq { @@ -370,6 +371,7 @@ struct mlx5e_cq_decomp { u8 mini_arr_idx; u16 left; u16 wqe_counter; + bool last_cqe_title; } ____cacheline_aligned_in_smp; enum mlx5e_dma_map_type {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 29dd3a04c154..1a2de9bc6538 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -608,13 +608,15 @@ void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE : MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; - mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n", + mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d %s)\n", params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ, params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ? BIT(mlx5e_mpwqe_get_log_rq_size(mdev, params, NULL)) : BIT(params->log_rq_mtu_frames), BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params, NULL)), - MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)); + MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS), + MLX5_CAP_GEN(mdev, enhanced_cqe_compression) ?
+ MLX5_CQE_COMPRESS_LAYOUT_ENHANCED : + MLX5_CQE_COMPRESS_LAYOUT_BASIC); MLX5_SET(cqc, cqc, cqe_comp_en, 1); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1669c7d7f285..c462b76743b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1205,6 +1205,13 @@ int mlx5e_open_rq(struct mlx5e_params *params, struct mlx5e_rq_param *param, MLX5_CAP_GEN(mdev, mini_cqe_resp_stride_index)) __set_bit(MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, &rq->state); + /* For enhanced CQE compression packet processing. decompress + * session according to the enhanced layout. + */ + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS) && + MLX5_CAP_GEN(mdev, enhanced_cqe_compression)) + __set_bit(MLX5E_RQ_STATE_MINI_CQE_ENHANCED, &rq->state); + return 0; err_destroy_rq: @@ -1895,6 +1902,7 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); cqe->op_own = 0xf1; + cqe->validity_iteration_count = 0xff; } cq->mdev = mdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index a61a43fc8d5c..b1ea0b995d9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -89,6 +89,25 @@ static inline void mlx5e_read_cqe_slot(struct mlx5_cqwq *wq, memcpy(data, mlx5_cqwq_get_wqe(wq, ci), sizeof(struct mlx5_cqe64)); } +static void mlx5e_read_enhanced_title_slot(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + struct mlx5_cqe64 *title = &cqd->title; + + memcpy(title, cqe, sizeof(struct mlx5_cqe64)); + + if (likely(test_bit(MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, &rq->state))) + return; + + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + cqd->wqe_counter = mpwrq_get_cqe_stride_index(title) + + mpwrq_get_cqe_consumed_strides(title); + else + cqd->wqe_counter = + mlx5_wq_cyc_ctr2ix(&rq->wqe.wq, be16_to_cpu(title->wqe_counter) + 1); +} + static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq, struct mlx5_cqwq *wq, u32 cqcc) @@ -175,6 +194,38 @@ static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq, cqd->title.rss_hash_result = 0; } +static u32 mlx5e_decompress_enhanced_cqe(struct mlx5e_rq *rq, + struct mlx5_cqwq *wq, + struct mlx5_cqe64 *cqe, + int budget_rem) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + u32 cqcc, left; + u32 i; + + left = get_cqe_enhanced_num_mini_cqes(cqe); + /* Here we avoid breaking the cqe compression session in the middle + * in case budget is not sufficient to handle all of it. In this case + * we return work_done == budget_rem to give 'busy' napi indication. 
+ */ + if (unlikely(left > budget_rem)) + return budget_rem; + + cqcc = wq->cc; + cqd->mini_arr_idx = 0; + memcpy(cqd->mini_arr, cqe, sizeof(struct mlx5_cqe64)); + for (i = 0; i < left; i++, cqd->mini_arr_idx++, cqcc++) { + mlx5e_decompress_cqe_no_hash(rq, wq, cqcc); + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq_shampo, + rq, &cqd->title); + } + wq->cc = cqcc; + rq->stats->cqe_compress_pkts += left; + + return left; +} + static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq, struct mlx5_cqwq *wq, int update_owner_only, @@ -220,7 +271,7 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, rq, &cqd->title); cqd->mini_arr_idx++; - return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; + return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem); } static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, struct page *page) @@ -2211,45 +2262,102 @@ mpwrq_cqe_out: mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); } -int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) +static int mlx5e_rx_cq_process_enhanced_cqe_comp(struct mlx5e_rq *rq, + struct mlx5_cqwq *cqwq, + int budget_rem) { - struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq); - struct mlx5_cqwq *cqwq = &cq->wq; - struct mlx5_cqe64 *cqe; + struct mlx5_cqe64 *cqe, *title_cqe = NULL; + struct mlx5e_cq_decomp *cqd = &rq->cqd; int work_done = 0; - if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state))) - return 0; + cqe = mlx5_cqwq_get_cqe_enahnced_comp(cqwq); + if (!cqe) + return work_done; - if (rq->cqd.left) { - work_done += mlx5e_decompress_cqes_cont(rq, cqwq, 0, budget); - if (work_done >= budget) - goto out; + if (cqd->last_cqe_title && + (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED)) { + rq->stats->cqe_compress_blks++; + cqd->last_cqe_title = false; } - cqe = mlx5_cqwq_get_cqe(cqwq); - if (!cqe) { - if (unlikely(work_done)) - goto out; - return 0; + do { + if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) { + if (title_cqe) { + mlx5e_read_enhanced_title_slot(rq, title_cqe); + title_cqe = NULL; + rq->stats->cqe_compress_blks++; + } + work_done += + mlx5e_decompress_enhanced_cqe(rq, cqwq, cqe, + budget_rem - work_done); + continue; + } + title_cqe = cqe; + mlx5_cqwq_pop(cqwq); + + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq_shampo, + rq, cqe); + work_done++; + } while (work_done < budget_rem && + (cqe = mlx5_cqwq_get_cqe_enahnced_comp(cqwq))); + + /* last cqe might be title on next poll bulk */ + if (title_cqe) { + mlx5e_read_enhanced_title_slot(rq, title_cqe); + cqd->last_cqe_title = true; } - do { + return work_done; +} + +static int mlx5e_rx_cq_process_basic_cqe_comp(struct mlx5e_rq *rq, + struct mlx5_cqwq *cqwq, + int budget_rem) +{ + struct mlx5_cqe64 *cqe; + int work_done = 0; + + if (rq->cqd.left) + work_done += mlx5e_decompress_cqes_cont(rq, cqwq, 0, budget_rem); + + while (work_done < budget_rem && (cqe = mlx5_cqwq_get_cqe(cqwq))) { if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) { work_done += mlx5e_decompress_cqes_start(rq, cqwq, - budget - work_done); + budget_rem - work_done); continue; } mlx5_cqwq_pop(cqwq); - INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, mlx5e_handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq_shampo, rq, cqe); - } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); + work_done++; + } + + return work_done; +} + +int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) +{ + struct mlx5e_rq *rq = 
container_of(cq, struct mlx5e_rq, cq); + struct mlx5_cqwq *cqwq = &cq->wq; + int work_done; + + if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state))) + return 0; + + if (test_bit(MLX5E_RQ_STATE_MINI_CQE_ENHANCED, &rq->state)) + work_done = mlx5e_rx_cq_process_enhanced_cqe_comp(rq, cqwq, + budget); + else + work_done = mlx5e_rx_cq_process_basic_cqe_comp(rq, cqwq, + budget); + + if (work_done == 0) + return 0; -out: if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state) && rq->hw_gro_data->skb) mlx5e_shampo_flush_skb(rq, NULL, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h index 4d629e5ddbc7..e4ef1d24a3ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -243,6 +243,23 @@ static inline struct mlx5_cqe64 *mlx5_cqwq_get_cqe(struct mlx5_cqwq *wq) return cqe; } +static inline +struct mlx5_cqe64 *mlx5_cqwq_get_cqe_enahnced_comp(struct mlx5_cqwq *wq) +{ + u8 sw_validity_iteration_count = mlx5_cqwq_get_wrap_cnt(wq) & 0xff; + u32 ci = mlx5_cqwq_get_ci(wq); + struct mlx5_cqe64 *cqe; + + cqe = mlx5_cqwq_get_wqe(wq, ci); + if (cqe->validity_iteration_count != sw_validity_iteration_count) + return NULL; + + /* ensure cqe content is read after cqe ownership bit/validity byte */ + dma_rmb(); + + return cqe; +} + static inline u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq) { return (u32)wq->fbc.sz_m1 + 1; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 1ff91cb79ded..eb3fac30488b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -882,6 +882,12 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } +static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) +{ + /* num_of_mini_cqes is zero based */ + return get_cqe_opcode(cqe) + 1; +} + static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; } -- cgit v1.2.3 From 92125c3a602454824d70edb1b2abb382811cab4f Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 10 Nov 2022 15:32:17 -0600 Subject: ibmvnic: Add hotpluggable CPU callbacks to reassign affinity hints When CPUs are added and removed, ibmvnic devices will reassign hint values. Introduce a new cpu hotplug state CPUHP_IBMVNIC_DEAD to signal to ibmvnic devices that the CPU has been removed and it is time to reset affinity hint assignments. On the other hand, when CPUs are being added, add a state instance to CPUHP_AP_ONLINE_DYN which will trigger a reassignment of affinity hints once the new CPUs are online. This implementation is based on the virtio_net driver. Signed-off-by: Thomas Falcon Signed-off-by: Dany Madden Signed-off-by: Nick Child Reviewed-by: Rick Lindsley Reviewed-by: Haren Myneni Signed-off-by: David S.
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 89 +++++++++++++++++++++++++++++++++++++- drivers/net/ethernet/ibm/ibmvnic.h | 4 ++ include/linux/cpuhotplug.h | 1 + 3 files changed, 93 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 0c969bdaf94d..2fc0d50dbb86 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -298,6 +298,57 @@ out: } } +static int ibmvnic_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct ibmvnic_adapter *adapter; + + adapter = hlist_entry_safe(node, struct ibmvnic_adapter, node); + ibmvnic_set_affinity(adapter); + return 0; +} + +static int ibmvnic_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct ibmvnic_adapter *adapter; + + adapter = hlist_entry_safe(node, struct ibmvnic_adapter, node_dead); + ibmvnic_set_affinity(adapter); + return 0; +} + +static int ibmvnic_cpu_down_prep(unsigned int cpu, struct hlist_node *node) +{ + struct ibmvnic_adapter *adapter; + + adapter = hlist_entry_safe(node, struct ibmvnic_adapter, node); + ibmvnic_clean_affinity(adapter); + return 0; +} + +static enum cpuhp_state ibmvnic_online; + +static int ibmvnic_cpu_notif_add(struct ibmvnic_adapter *adapter) +{ + int ret; + + ret = cpuhp_state_add_instance_nocalls(ibmvnic_online, &adapter->node); + if (ret) + return ret; + ret = cpuhp_state_add_instance_nocalls(CPUHP_IBMVNIC_DEAD, + &adapter->node_dead); + if (!ret) + return ret; + cpuhp_state_remove_instance_nocalls(ibmvnic_online, &adapter->node); + return ret; +} + +static void ibmvnic_cpu_notif_remove(struct ibmvnic_adapter *adapter) +{ + cpuhp_state_remove_instance_nocalls(ibmvnic_online, &adapter->node); + cpuhp_state_remove_instance_nocalls(CPUHP_IBMVNIC_DEAD, + &adapter->node_dead); +} + static long h_reg_sub_crq(unsigned long unit_address, unsigned long token, unsigned long length, unsigned long *number, unsigned long *irq) @@ -6292,10 +6343,19 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } dev_info(&dev->dev, "ibmvnic registered\n"); + rc = ibmvnic_cpu_notif_add(adapter); + if (rc) { + netdev_err(netdev, "Registering cpu notifier failed\n"); + goto cpu_notif_add_failed; + } + complete(&adapter->probe_done); return 0; +cpu_notif_add_failed: + unregister_netdev(netdev); + ibmvnic_register_fail: device_remove_file(&dev->dev, &dev_attr_failover); @@ -6346,6 +6406,8 @@ static void ibmvnic_remove(struct vio_dev *dev) spin_unlock_irqrestore(&adapter->state_lock, flags); + ibmvnic_cpu_notif_remove(adapter); + flush_work(&adapter->ibmvnic_reset); flush_delayed_work(&adapter->ibmvnic_delayed_reset); @@ -6476,15 +6538,40 @@ static struct vio_driver ibmvnic_driver = { /* module functions */ static int __init ibmvnic_module_init(void) { + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "net/ibmvnic:online", + ibmvnic_cpu_online, + ibmvnic_cpu_down_prep); + if (ret < 0) + goto out; + ibmvnic_online = ret; + ret = cpuhp_setup_state_multi(CPUHP_IBMVNIC_DEAD, "net/ibmvnic:dead", + NULL, ibmvnic_cpu_dead); + if (ret) + goto err_dead; + + ret = vio_register_driver(&ibmvnic_driver); + if (ret) + goto err_vio_register; + pr_info("%s: %s %s\n", ibmvnic_driver_name, ibmvnic_driver_string, IBMVNIC_DRIVER_VERSION); - return vio_register_driver(&ibmvnic_driver); + return 0; +err_vio_register: + cpuhp_remove_multi_state(CPUHP_IBMVNIC_DEAD); +err_dead: + cpuhp_remove_multi_state(ibmvnic_online); +out: + return ret; } static void __exit 
ibmvnic_module_exit(void) { vio_unregister_driver(&ibmvnic_driver); + cpuhp_remove_multi_state(CPUHP_IBMVNIC_DEAD); + cpuhp_remove_multi_state(ibmvnic_online); } module_init(ibmvnic_module_init); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 6720fec1ae67..b35c9b6f913b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -984,6 +984,10 @@ struct ibmvnic_adapter { int reset_done_rc; bool wait_for_reset; + /* CPU hotplug instances for online & dead */ + struct hlist_node node; + struct hlist_node node_dead; + /* partner capabilities */ u64 min_tx_queues; u64 min_rx_queues; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f61447913db9..c8bc85a87b1e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -69,6 +69,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, + CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, -- cgit v1.2.3 From 2d577252579b3efb9e934b68948a2edfa9920110 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:23 +0530 Subject: bpf: Remove BPF_MAP_OFF_ARR_MAX In f71b2f64177a ("bpf: Refactor map->off_arr handling"), map->off_arr was refactored to be btf_field_offs. The number of field offsets is equal to the maximum possible fields limited by BTF_FIELDS_MAX. Hence, reuse BTF_FIELDS_MAX, as spin_lock and timer no longer need to be handled specially for offset sorting; fix the comment, and remove the incorrect WARN_ON, as rec->cnt can never exceed this value. The reason for keeping a separate constant was that it was always 2 more than the total number of kptrs. This is no longer the case. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++----- kernel/bpf/btf.c | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 798aec816970..1a66a1df1af1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,9 +165,8 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 10, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, + /* Support at most 10 fields in a BTF type */ + BTF_FIELDS_MAX = 10, }; enum btf_field_type { @@ -203,8 +202,8 @@ struct btf_record { struct btf_field_offs { u32 cnt; - u32 field_off[BPF_MAP_OFF_ARR_MAX]; - u8 field_sz[BPF_MAP_OFF_ARR_MAX]; + u32 field_off[BTF_FIELDS_MAX]; + u8 field_sz[BTF_FIELDS_MAX]; }; struct bpf_map { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5579ff3a5b54..12361d7b2498 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3584,7 +3584,7 @@ struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) u8 *sz; BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); - if (IS_ERR_OR_NULL(rec) || WARN_ON_ONCE(rec->cnt > sizeof(foffs->field_off))) + if (IS_ERR_OR_NULL(rec)) return NULL; foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); -- cgit v1.2.3 From e5feed0f64f73e167ef70755d3dc2db959d8fd5c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:24 +0530 Subject: bpf: Fix copy_map_value, zero_map_value The current offset needs to also skip over the already copied region in addition to the size of the next field. This case manifests where there are gaps between adjacent special fields.
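In loop form, the corrected walk is roughly the following (a simplified sketch modeled on bpf_obj_memcpy() after this fix; the *_sketch name is made up here, and the real function goes through a copy callback rather than calling memcpy() directly):

	static inline void bpf_obj_memcpy_sketch(struct btf_field_offs *foffs,
						 void *dst, const void *src, u32 size)
	{
		u32 curr_off = 0;
		int i;

		for (i = 0; i < foffs->cnt; i++) {
			u32 next_off = foffs->field_off[i];
			u32 sz = next_off - curr_off;

			/* copy the plain region preceding this special field */
			memcpy(dst + curr_off, src + curr_off, sz);
			/* skip both the copied region and the field itself */
			curr_off += foffs->field_sz[i] + sz;
		}
		/* copy whatever remains after the last special field */
		memcpy(dst + curr_off, src + curr_off, size - curr_off);
	}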
It was observed that for a map value with size 48, having fields at: off: 0, 16, 32 size: 4, 16, 16 The current code does: memcpy(dst + 0, src + 0, 0) memcpy(dst + 4, src + 4, 12) memcpy(dst + 20, src + 20, 12) memcpy(dst + 36, src + 36, 12) With the fix, it is done correctly as: memcpy(dst + 0, src + 0, 0) memcpy(dst + 4, src + 4, 12) memcpy(dst + 32, src + 32, 0) memcpy(dst + 48, src + 48, 0) Fixes: 4d7d7f69f4b1 ("bpf: Adapt copy_map_value for multiple offset case") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1a66a1df1af1..f08eb2d27de0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -360,7 +360,7 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } @@ -390,7 +390,7 @@ static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memset(dst + curr_off, 0, size - curr_off); } -- cgit v1.2.3 From f0c5941ff5b255413d31425bb327c2aec3625673 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:25 +0530 Subject: bpf: Support bpf_list_head in map values Add the support on the map side to parse, recognize, verify, and build metadata table for a new special field of the type struct bpf_list_head. To parameterize the bpf_list_head for a certain value type and the list_node member it will accept in that value type, we use BTF declaration tags. The definition of bpf_list_head in a map value will be done as follows: struct foo { struct bpf_list_node node; int data; }; struct map_value { struct bpf_list_head head __contains(foo, node); }; Then, the bpf_list_head only allows adding to the list 'head' using the bpf_list_node 'node' for the type struct foo. The 'contains' annotation is a BTF declaration tag composed of four parts, "contains:name:node" where the name is then used to look up the type in the map BTF, with its kind hardcoded to BTF_KIND_STRUCT during the lookup. The node defines name of the member in this type that has the type struct bpf_list_node, which is actually used for linking into the linked list. For now, 'kind' part is hardcoded as struct. This allows building intrusive linked lists in BPF, using container_of to obtain pointer to entry, while being completely type safe from the perspective of the verifier. The verifier knows exactly the type of the nodes, and knows that list helpers return that type at some fixed offset where the bpf_list_node member used for this list exists. The verifier also uses this information to disallow adding types that are not accepted by a certain list. For now, no elements can be added to such lists. Support for that is coming in future patches, hence draining and freeing items is done with a TODO that will be resolved in a future patch. Note that the bpf_list_head_free function moves the list out to a local variable under the lock and releases it, doing the actual draining of the list items outside the lock. 
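Schematically, the free path described above reduces to the following sketch (illustration only: bpf_list_head_free_sketch is a made-up name, the real function in the helpers.c hunk below open-codes the walk instead of using list_splice_init(), and the __bpf_spin_lock helpers are internal to kernel/bpf/helpers.c):

	static void bpf_list_head_free_sketch(const struct btf_field *field,
					      struct list_head *head,
					      struct bpf_spin_lock *lock)
	{
		struct list_head local;

		INIT_LIST_HEAD(&local);
		__bpf_spin_lock_irqsave(lock);
		list_splice_init(head, &local);	/* detach every node while holding the lock */
		__bpf_spin_unlock_irqrestore(lock);

		while (!list_empty(&local)) {	/* free entries with the lock released */
			struct list_head *node = local.next;

			list_del(node);
			/* container_of() by the recorded offset of bpf_list_node */
			kfree((void *)node - field->list_head.node_offset);
		}
	}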
While this helps with not holding the lock for too long pessimizing other concurrent list operations, it is also necessary for deadlock prevention: unless every function called in the critical section would be notrace, a fentry/fexit program could attach and call bpf_map_update_elem again on the map, leading to the same lock being acquired if the key matches and lead to a deadlock. While this requires some special effort on part of the BPF programmer to trigger and is highly unlikely to occur in practice, it is always better if we can avoid such a condition. While notrace would prevent this, doing the draining outside the lock has advantages of its own, hence it is used to also fix the deadlock related problem. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 +++++ include/uapi/linux/bpf.h | 10 +++ kernel/bpf/btf.c | 145 ++++++++++++++++++++++++++++++++++++++++- kernel/bpf/helpers.c | 32 +++++++++ kernel/bpf/syscall.c | 22 ++++++- kernel/bpf/verifier.c | 7 ++ tools/include/uapi/linux/bpf.h | 10 +++ 7 files changed, 239 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f08eb2d27de0..05f98e9e5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -175,6 +175,7 @@ enum btf_field_type { BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, + BPF_LIST_HEAD = (1 << 4), }; struct btf_field_kptr { @@ -184,11 +185,18 @@ struct btf_field_kptr { u32 btf_id; }; +struct btf_field_list_head { + struct btf *btf; + u32 value_btf_id; + u32 node_offset; +}; + struct btf_field { u32 offset; enum btf_field_type type; union { struct btf_field_kptr kptr; + struct btf_field_list_head list_head; }; }; @@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_LIST_HEAD: + return "bpf_list_head"; default: WARN_ON_ONCE(1); return "unknown"; @@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); + case BPF_LIST_HEAD: + return sizeof(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); + case BPF_LIST_HEAD: + return __alignof__(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb4c911d2a03..6580448e9f77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6888,6 +6888,16 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 12361d7b2498..c0d73d71c539 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3205,9 +3205,15 @@ enum { struct btf_field_info { enum btf_field_type type; u32 off; - struct { - u32 type_id; - } kptr; + union { + struct { + u32 type_id; + } kptr; + struct { + const char *node_name; + u32 value_btf_id; + } list_head; + }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, @@ -3261,6 +3267,63 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, return BTF_FIELD_FOUND; } +static const char *btf_find_decl_tag_value(const struct btf *btf, + const struct btf_type *pt, + int comp_idx, const char *tag_key) +{ + int i; + + for (i = 1; i < btf_nr_types(btf); i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + int len = strlen(tag_key); + + if (!btf_type_is_decl_tag(t)) + continue; + if (pt != btf_type_by_id(btf, t->type) || + btf_type_decl_tag(t)->component_idx != comp_idx) + continue; + if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) + continue; + return __btf_name_by_offset(btf, t->name_off) + len; + } + return NULL; +} + +static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, + const struct btf_type *t, int comp_idx, + u32 off, int sz, struct btf_field_info *info) +{ + const char *value_type; + const char *list_node; + s32 id; + + if (!__btf_type_is_struct(t)) + return BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); + if (!value_type) + return -EINVAL; + list_node = strstr(value_type, ":"); + if (!list_node) + return -EINVAL; + value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); + if (!value_type) + return -ENOMEM; + id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); + kfree(value_type); + if (id < 0) + return id; + list_node++; + if (str_is_empty(list_node)) + return -EINVAL; + info->type = BPF_LIST_HEAD; + info->off = off; + info->list_head.value_btf_id = id; + info->list_head.node_name = list_node; + return BTF_FIELD_FOUND; +} + static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, int *align, int *sz) { @@ -3284,6 +3347,12 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_LIST_HEAD) { + if (!strcmp(name, "bpf_list_head")) { + type = BPF_LIST_HEAD; + goto end; + } + } /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & BPF_KPTR) { type = BPF_KPTR_REF; @@ -3339,6 +3408,12 @@ static int btf_find_struct_field(const struct btf *btf, if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, t, member_type, i, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3393,6 +3468,12 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, var, var_type, -1, off, sz, + idx < info_cnt ? 
&info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3491,11 +3572,52 @@ end_btf: return ret; } +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) +{ + const struct btf_type *t, *n = NULL; + const struct btf_member *member; + u32 offset; + int i; + + t = btf_type_by_id(btf, info->list_head.value_btf_id); + /* We've already checked that value_btf_id is a struct type. We + * just need to figure out the offset of the list_node, and + * verify its type. + */ + for_each_member(i, t, member) { + if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off))) + continue; + /* Invalid BTF, two members with same name */ + if (n) + return -EINVAL; + n = btf_type_by_id(btf, member->type); + if (!__btf_type_is_struct(n)) + return -EINVAL; + if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) + return -EINVAL; + offset = __btf_member_bit_offset(n, member); + if (offset % 8) + return -EINVAL; + offset /= 8; + if (offset % __alignof__(struct bpf_list_node)) + return -EINVAL; + + field->list_head.btf = (struct btf *)btf; + field->list_head.value_btf_id = info->list_head.value_btf_id; + field->list_head.node_offset = offset; + } + if (!n) + return -ENOENT; + return 0; +} + struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; struct btf_record *rec; + u32 next_off = 0; int ret, i, cnt; ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); @@ -3517,6 +3639,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type ret = -EFAULT; goto end; } + if (info_arr[i].off < next_off) { + ret = -EEXIST; + goto end; + } + next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type); rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; @@ -3539,12 +3666,24 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type if (ret < 0) goto end; break; + case BPF_LIST_HEAD: + ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; default: ret = -EFAULT; goto end; } rec->cnt++; } + + /* bpf_list_head requires bpf_spin_lock */ + if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { + ret = -EINVAL; + goto end; + } + return rec; end: btf_record_free(rec); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 283f55bbeb70..7bc71995f17c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1706,6 +1706,38 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock) +{ + struct list_head *head = list_head, *orig_head = list_head; + + BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); + BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + + /* Do the actual list draining outside the lock to not hold the lock for + * too long, and also prevent deadlocks if tracing programs end up + * executing on entry/exit of functions called inside the critical + * section, and end up doing map ops that call bpf_list_head_free for + * the same map value again. 
+ */ + __bpf_spin_lock_irqsave(spin_lock); + if (!head->next || list_empty(head)) + goto unlock; + head = head->next; +unlock: + INIT_LIST_HEAD(orig_head); + __bpf_spin_unlock_irqrestore(spin_lock); + + while (head != orig_head) { + void *obj = head; + + obj -= field->list_head.node_offset; + head = head->next; + /* TODO: Rework later */ + kfree(obj); + } +} + BTF_SET8_START(tracing_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85532d301124..fdbae52f463f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -536,6 +536,9 @@ void btf_record_free(struct btf_record *rec) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); break; + case BPF_LIST_HEAD: + /* Nothing to release for bpf_list_head */ + break; default: WARN_ON_ONCE(1); continue; @@ -578,6 +581,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) goto free; } break; + case BPF_LIST_HEAD: + /* Nothing to acquire for bpf_list_head */ + break; default: ret = -EFAULT; WARN_ON_ONCE(1); @@ -637,6 +643,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_KPTR_REF: field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); break; + case BPF_LIST_HEAD: + if (WARN_ON_ONCE(rec->spin_lock_off < 0)) + continue; + bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); + break; default: WARN_ON_ONCE(1); continue; @@ -965,7 +976,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, + map->record = btf_parse_fields(btf, value_type, + BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1012,6 +1024,14 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, goto free_map_tab; } break; + case BPF_LIST_HEAD: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; default: /* Fail if map_type checks are missing for a field type */ ret = -EOPNOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 07c0259dfc1a..a50018e2d4a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12814,6 +12814,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_list_head yet\n"); + return -EINVAL; + } + } + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fb4c911d2a03..6580448e9f77 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6888,6 +6888,16 @@ struct bpf_dynptr { __u64 :64; } __attribute__((aligned(8))); +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. 
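As a concrete sketch of the declaration-tag plumbing, the __contains() annotation used in the example above can be spelled as a macro along the following lines (this mirrors the spelling BPF selftests later adopt; treat it as illustrative rather than part of this patch). Note the bpf_spin_lock in the same value, which btf_parse_fields() above now requires alongside a bpf_list_head:

	#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))

	struct foo {
		struct bpf_list_node node;
		int data;
	};

	struct map_value {
		struct bpf_spin_lock lock;	/* required, see btf_parse_fields() */
		struct bpf_list_head head __contains(foo, node);
	};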
-- cgit v1.2.3 From 2de2669b4e52b2ae2f118bfc310004f50b47f0f5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:26 +0530 Subject: bpf: Rename RET_PTR_TO_ALLOC_MEM Currently, the verifier has two return types, RET_PTR_TO_ALLOC_MEM, and RET_PTR_TO_ALLOC_MEM_OR_NULL, however the former is confusingly named to imply that it carries MEM_ALLOC, while only the latter does. This causes confusion during code review leading to conclusions like that the return value of RET_PTR_TO_DYNPTR_MEM_OR_NULL (which is RET_PTR_TO_ALLOC_MEM | PTR_MAYBE_NULL) may be consumable by bpf_ringbuf_{submit,commit}. Rename it to make it clear MEM_ALLOC needs to be tacked on top of RET_PTR_TO_MEM. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++--- kernel/bpf/verifier.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05f98e9e5c48..2fe3ec620d54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,7 +607,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ - RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ + RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, @@ -617,8 +617,8 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, - RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a50018e2d4a0..c88da7e3ca74 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7630,7 +7630,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; break; - case RET_PTR_TO_ALLOC_MEM: + case RET_PTR_TO_MEM: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; -- cgit v1.2.3 From 894f2a8b1673a355a1a7507a4dfa6a3c836d07c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:27 +0530 Subject: bpf: Rename MEM_ALLOC to MEM_RINGBUF Currently, verifier uses MEM_ALLOC type tag to specially tag memory returned from bpf_ringbuf_reserve helper. However, this is currently only used for this purpose and there is an implicit assumption that it only refers to ringbuf memory (e.g. the check for ARG_PTR_TO_ALLOC_MEM in check_func_arg_reg_off). Hence, rename MEM_ALLOC to MEM_RINGBUF to indicate this special relationship and instead open the use of MEM_ALLOC for more generic allocations made for user types. 
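For reference, the ringbuf reserve/submit flow that these argument and return types describe looks like this from the program side (a minimal sketch; the section, map and function names are hypothetical):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_RINGBUF);
		__uint(max_entries, 4096);
	} rb SEC(".maps");

	SEC("tracepoint/syscalls/sys_enter_write")
	int log_write(void *ctx)
	{
		/* reserve returns RET_PTR_TO_RINGBUF_MEM_OR_NULL */
		__u64 *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);

		if (!e)
			return 0;
		*e = bpf_ktime_get_ns();
		/* submit consumes it as ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE */
		bpf_ringbuf_submit(e, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";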
Also, since ARG_PTR_TO_ALLOC_MEM_OR_NULL is unused, simply drop it. Finally, update selftests using 'alloc_' verifier string to 'ringbuf_'. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++------- kernel/bpf/ringbuf.c | 6 +++--- kernel/bpf/verifier.c | 14 +++++++------- tools/testing/selftests/bpf/prog_tests/dynptr.c | 2 +- tools/testing/selftests/bpf/verifier/ringbuf.c | 2 +- tools/testing/selftests/bpf/verifier/spill_fill.c | 2 +- 6 files changed, 17 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fe3ec620d54..afc1c51b59ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,10 +488,8 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - /* MEM was "allocated" from a different helper, and cannot be mixed - * with regular non-MEM_ALLOC'ed MEM types. - */ - MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + /* MEM points to BPF ring buffer reservation. */ + MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), @@ -565,7 +563,7 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ - ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ @@ -582,7 +580,6 @@ enum bpf_arg_type { ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, - ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* pointer to memory does not need to be initialized, helper function must fill @@ -617,7 +614,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 9e832acf4692..80f4b4d88aaf 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -447,7 +447,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) const struct bpf_func_proto bpf_ringbuf_reserve_proto = { .func = bpf_ringbuf_reserve, - .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, @@ -490,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = 
ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, + .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -503,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, + .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c88da7e3ca74..c588e5483540 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -577,8 +577,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, if (type & MEM_RDONLY) strncpy(prefix, "rdonly_", 32); - if (type & MEM_ALLOC) - strncpy(prefix, "alloc_", 32); + if (type & MEM_RINGBUF) + strncpy(prefix, "ringbuf_", 32); if (type & MEM_USER) strncpy(prefix, "user_", 32); if (type & MEM_PERCPU) @@ -5785,7 +5785,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_MEM | MEM_ALLOC, + PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, }, }; @@ -5803,7 +5803,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; +static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5836,7 +5836,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -5957,14 +5957,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: - case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case SCALAR_VALUE: /* Some of the argument types nevertheless require a * zero register offset. 
*/ - if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) + if (base_type(arg_type) != ARG_PTR_TO_RINGBUF_MEM) return 0; break; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 8fc4e6c02bfd..b0c06f821cb8 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -17,7 +17,7 @@ static struct { {"ringbuf_missing_release2", "Unreleased reference id=2"}, {"ringbuf_missing_release_callback", "Unreleased reference id"}, {"use_after_invalid", "Expected an initialized dynptr as arg #3"}, - {"ringbuf_invalid_api", "type=mem expected=alloc_mem"}, + {"ringbuf_invalid_api", "type=mem expected=ringbuf_mem"}, {"add_dynptr_to_map1", "invalid indirect read from stack"}, {"add_dynptr_to_map2", "invalid indirect read from stack"}, {"data_slice_out_of_bounds_ringbuf", "value is outside of the allowed memory range"}, diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c index b64d33e4833c..84838feba47f 100644 --- a/tools/testing/selftests/bpf/verifier/ringbuf.c +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -28,7 +28,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "dereference of modified alloc_mem ptr R1", + .errstr = "dereference of modified ringbuf_mem ptr R1", }, { "ringbuf: invalid reservation offset 2", diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index e23f07175e1b..9bb302dade23 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -84,7 +84,7 @@ }, .fixup_map_ringbuf = { 1 }, .result = REJECT, - .errstr = "R0 pointer arithmetic on alloc_mem_or_null prohibited", + .errstr = "R0 pointer arithmetic on ringbuf_mem_or_null prohibited", }, { "check corrupted spill/fill", -- cgit v1.2.3 From 6728aea7216c0c06c98e2e58d753a5e8b2ae1c6f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:28 +0530 Subject: bpf: Refactor btf_struct_access Instead of having to pass multiple arguments that describe the register, pass the bpf_reg_state into the btf_struct_access callback. Currently, all call sites simply reuse the btf and btf_id of the reg they want to check the access of. The only exception to this pattern is the callsite in check_ptr_to_map_access, hence for that case create a dummy reg to simulate PTR_TO_BTF_ID access. 
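After the refactor, a minimal implementation of the callback takes roughly this shape (an illustrative sketch with a made-up name, not one of the call sites converted below; the register now carries the btf and btf_id that used to be passed separately):

	static int example_btf_struct_access(struct bpf_verifier_log *log,
					     const struct bpf_reg_state *reg,
					     int off, int size,
					     enum bpf_access_type atype,
					     u32 *next_btf_id,
					     enum bpf_type_flag *flag)
	{
		/* reg->btf and reg->btf_id replace the old btf/t arguments */
		if (atype == BPF_READ)
			return btf_struct_access(log, reg, off, size, atype,
						 next_btf_id, flag);
		return -EACCES;
	}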
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 ++++++++--------- include/linux/filter.h | 8 ++++---- kernel/bpf/btf.c | 11 +++++++---- kernel/bpf/verifier.c | 12 +++++++----- net/bpf/bpf_dummy_struct_ops.c | 14 +++++++------- net/core/filter.c | 34 ++++++++++++++-------------------- net/ipv4/bpf_tcp_ca.c | 13 ++++++------- net/netfilter/nf_conntrack_bpf.c | 17 +++++++---------- 8 files changed, 60 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index afc1c51b59ff..49f9d2bec401 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -771,6 +771,7 @@ struct bpf_prog_ops { union bpf_attr __user *uattr); }; +struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * @@ -792,9 +793,8 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); }; @@ -2080,9 +2080,9 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, @@ -2333,9 +2333,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) } static inline int btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { return -EACCES; diff --git a/include/linux/filter.h b/include/linux/filter.h index efc42a6e3aed..787d35dbf5b0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -568,10 +568,10 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c0d73d71c539..875355ff3718 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6017,15 +6017,18 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype __maybe_unused, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int 
off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag) { + const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; + const struct btf_type *t; + u32 id = reg->btf_id; int err; - u32 id; + t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c588e5483540..5e74f460dfd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4688,16 +4688,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id, &flag); + ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } if (ret < 0) @@ -4723,6 +4721,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; @@ -4761,7 +4760,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); + /* Simulate access to a PTR_TO_BTF_ID */ + memset(&map_reg, 0, sizeof(map_reg)); + mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index e78dadfc5829..2d434c1f4617 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -156,29 +156,29 @@ static bool bpf_dummy_ops_is_valid_access(int off, int size, } static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { const struct btf_type *state; + const struct btf_type *t; s32 type_id; int err; - type_id = btf_find_by_name_kind(btf, "bpf_dummy_ops_state", + type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state", BTF_KIND_STRUCT); if (type_id < 0) return -EINVAL; - state = btf_type_by_id(btf, type_id); + t = btf_type_by_id(reg->btf, reg->btf_id); + state = btf_type_by_id(reg->btf, type_id); if (t != state) { bpf_log(log, "only access to bpf_dummy_ops_state is supported\n"); return -EACCES; } - err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); if (err < 0) return err; diff --git a/net/core/filter.c b/net/core/filter.c index 6dd2baf5eeb2..37fad5a9b752 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8651,28 +8651,25 @@ static bool tc_cls_act_is_valid_access(int off, int size, DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); -int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 
*next_btf_id, - enum bpf_type_flag *flag); +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { int ret = -EACCES; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8738,21 +8735,18 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { int ret = -EACCES; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 6da16ae6a962..d15c91de995f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -69,18 +69,17 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { + const struct btf_type *t; size_t end; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); + t = btf_type_by_id(reg->btf, reg->btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); return -EACCES; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 8639e7efd0e2..24002bc61e07 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -191,19 +191,16 @@ BTF_ID(struct, nf_conn___init) /* Check writes into `struct nf_conn` */ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct 
bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { - const struct btf_type *ncit; - const struct btf_type *nct; + const struct btf_type *ncit, *nct, *t; size_t end; - ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]); - nct = btf_type_by_id(btf, btf_nf_conn_ids[0]); - + ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]); + nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); if (t != nct && t != ncit) { bpf_log(log, "only read is supported\n"); return -EACCES; -- cgit v1.2.3 From 14d898f3c1b3bf9c4375ee3255ec9e9b89a35578 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:05:59 +0100 Subject: dev: Move received_rps counter next to RPS members in softnet data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the received_rps counter value next to the other RPS-related members in softnet_data. This closes two four-byte holes in the structure, making room for another pointer in the first two cache lines without bumping the xmit struct to its own line. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..ddc59ef98500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3128,7 +3128,6 @@ struct softnet_data { /* stats */ unsigned int processed; unsigned int time_squeeze; - unsigned int received_rps; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif @@ -3161,6 +3160,7 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif + unsigned int received_rps; unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; -- cgit v1.2.3 From 32637e33003f36e75e9147788cc0e2f21706ef99 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:06:00 +0100 Subject: bpf: Expand map key argument of bpf_redirect_map to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For queueing packets in XDP we want to add a new redirect map type with support for 64-bit indexes. To prepare for this, expand the width of the 'key' argument to the bpf_redirect_map() helper. Since BPF registers are always 64-bit, this should be safe to do after the fact.
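At the program level the call is unchanged, with the key simply widened to u64 in the helper prototype; a minimal devmap redirect sketch (map and program names are hypothetical):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_DEVMAP);
		__uint(max_entries, 64);
		__type(key, __u32);
		__type(value, __u32);
	} tx_port SEC(".maps");

	SEC("xdp")
	int xdp_redirect_prog(struct xdp_md *ctx)
	{
		/* low bits of flags select the fallback action on lookup failure */
		return bpf_redirect_map(&tx_port, 0, XDP_PASS);
	}

	char LICENSE[] SEC("license") = "GPL";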
Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/filter.h | 12 ++++++------ include/uapi/linux/bpf.h | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 4 ++-- net/xdp/xskmap.c | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49f9d2bec401..54462dd28824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -135,7 +135,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure diff --git a/include/linux/filter.h b/include/linux/filter.h index 787d35dbf5b0..bf701976056e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,13 +643,13 @@ struct bpf_nh_params { }; struct bpf_redirect_info { - u32 flags; - u32 tgt_index; + u64 tgt_index; void *tgt_value; struct bpf_map *map; + u32 flags; + u32 kern_flags; u32 map_id; enum bpf_map_type map_type; - u32 kern_flags; struct bpf_nh_params nh; }; @@ -1504,7 +1504,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { @@ -1515,7 +1515,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; - ri->tgt_value = lookup_elem(map, ifindex); + ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program @@ -1527,7 +1527,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind return flags & action_mask; } - ri->tgt_index = ifindex; + ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6580448e9f77..ab86145df760 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2647,7 +2647,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. 
Depending on its type, this *map* can contain diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6b6a78c04b90..e0b2d016f0bf 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -667,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f9a87dcc5535..d01e4c55b376 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e74f460dfd0..be24774961ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14384,7 +14384,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, (int (*)(struct bpf_map *map, bpf_callback_t callback_fn, diff --git a/net/core/filter.c b/net/core/filter.c index 37fad5a9b752..754dd01354d8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4414,10 +4414,10 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { - return map->ops->map_redirect(map, ifindex, flags); + return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index acc8e52a4f5f..771d0fa90ef5 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -231,9 +231,9 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); } -- cgit v1.2.3 From 06463f6e98df34908c26aa8e7a31a279646b1f51 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 9 Nov 2022 14:42:49 -0800 Subject: wifi: wl1251: drop support for platform data Remove support for configuring the device via platform data because there are no users of wl1251_platform_data left in the mainline kernel. 
Signed-off-by: Dmitry Torokhov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221109224250.2885119-2-dmitry.torokhov@gmail.com --- MAINTAINERS | 1 - drivers/net/wireless/ti/Kconfig | 8 ----- drivers/net/wireless/ti/wilink_platform_data.c | 35 -------------------- drivers/net/wireless/ti/wl1251/sdio.c | 8 +---- drivers/net/wireless/ti/wl1251/spi.c | 15 ++------- drivers/net/wireless/ti/wlcore/spi.c | 1 - include/linux/wl12xx.h | 44 -------------------------- 7 files changed, 4 insertions(+), 108 deletions(-) delete mode 100644 drivers/net/wireless/ti/wilink_platform_data.c delete mode 100644 include/linux/wl12xx.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 377aedb3808b..56764f76752b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20665,7 +20665,6 @@ W: https://wireless.wiki.kernel.org/en/users/Drivers/wl12xx W: https://wireless.wiki.kernel.org/en/users/Drivers/wl1251 T: git git://git.kernel.org/pub/scm/linux/kernel/git/luca/wl12xx.git F: drivers/net/wireless/ti/ -F: include/linux/wl12xx.h TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER M: John Stultz diff --git a/drivers/net/wireless/ti/Kconfig b/drivers/net/wireless/ti/Kconfig index 7c0b17a76fe2..3fcd9e395f72 100644 --- a/drivers/net/wireless/ti/Kconfig +++ b/drivers/net/wireless/ti/Kconfig @@ -18,12 +18,4 @@ source "drivers/net/wireless/ti/wl18xx/Kconfig" # keep last for automatic dependencies source "drivers/net/wireless/ti/wlcore/Kconfig" -config WILINK_PLATFORM_DATA - bool "TI WiLink platform data" - depends on WLCORE_SDIO || WL1251_SDIO - default y - help - Small platform data bit needed to pass data to the sdio modules. - - endif # WLAN_VENDOR_TI diff --git a/drivers/net/wireless/ti/wilink_platform_data.c b/drivers/net/wireless/ti/wilink_platform_data.c deleted file mode 100644 index 1de6a62d526f..000000000000 --- a/drivers/net/wireless/ti/wilink_platform_data.c +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * This file is part of wl12xx - * - * Copyright (C) 2010-2011 Texas Instruments, Inc. 
- */ - -#include -#include -#include - -static struct wl1251_platform_data *wl1251_platform_data; - -int __init wl1251_set_platform_data(const struct wl1251_platform_data *data) -{ - if (wl1251_platform_data) - return -EBUSY; - if (!data) - return -EINVAL; - - wl1251_platform_data = kmemdup(data, sizeof(*data), GFP_KERNEL); - if (!wl1251_platform_data) - return -ENOMEM; - - return 0; -} - -struct wl1251_platform_data *wl1251_get_platform_data(void) -{ - if (!wl1251_platform_data) - return ERR_PTR(-ENODEV); - - return wl1251_platform_data; -} -EXPORT_SYMBOL(wl1251_get_platform_data); diff --git a/drivers/net/wireless/ti/wl1251/sdio.c b/drivers/net/wireless/ti/wl1251/sdio.c index c9a4e9a43400..301bd0043a43 100644 --- a/drivers/net/wireless/ti/wl1251/sdio.c +++ b/drivers/net/wireless/ti/wl1251/sdio.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -197,7 +196,6 @@ static int wl1251_sdio_probe(struct sdio_func *func, struct wl1251 *wl; struct ieee80211_hw *hw; struct wl1251_sdio *wl_sdio; - const struct wl1251_platform_data *wl1251_board_data; struct device_node *np = func->dev.of_node; hw = wl1251_alloc_hw(); @@ -225,11 +223,7 @@ static int wl1251_sdio_probe(struct sdio_func *func, wl->if_priv = wl_sdio; wl->if_ops = &wl1251_sdio_ops; - wl1251_board_data = wl1251_get_platform_data(); - if (!IS_ERR(wl1251_board_data)) { - wl->irq = wl1251_board_data->irq; - wl->use_eeprom = wl1251_board_data->use_eeprom; - } else if (np) { + if (np) { wl->use_eeprom = of_property_read_bool(np, "ti,wl1251-has-eeprom"); wl->irq = of_irq_get(np, 0); if (wl->irq == -EPROBE_DEFER) { diff --git a/drivers/net/wireless/ti/wl1251/spi.c b/drivers/net/wireless/ti/wl1251/spi.c index 9df38726e8b0..08d9814b49c1 100644 --- a/drivers/net/wireless/ti/wl1251/spi.c +++ b/drivers/net/wireless/ti/wl1251/spi.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -226,16 +225,13 @@ static const struct wl1251_if_operations wl1251_spi_ops = { static int wl1251_spi_probe(struct spi_device *spi) { - struct wl1251_platform_data *pdata = dev_get_platdata(&spi->dev); struct device_node *np = spi->dev.of_node; struct ieee80211_hw *hw; struct wl1251 *wl; int ret; - if (!np && !pdata) { - wl1251_error("no platform data"); + if (!np) return -ENODEV; - } hw = wl1251_alloc_hw(); if (IS_ERR(hw)) @@ -259,14 +255,9 @@ static int wl1251_spi_probe(struct spi_device *spi) goto out_free; } - if (np) { - wl->use_eeprom = of_property_read_bool(np, "ti,wl1251-has-eeprom"); - wl->power_gpio = of_get_named_gpio(np, "ti,power-gpio", 0); - } else if (pdata) { - wl->power_gpio = pdata->power_gpio; - wl->use_eeprom = pdata->use_eeprom; - } + wl->use_eeprom = of_property_read_bool(np, "ti,wl1251-has-eeprom"); + wl->power_gpio = of_get_named_gpio(np, "ti,power-gpio", 0); if (wl->power_gpio == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto out_free; diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c index 7eae1ec2eb2b..2d2edddc77bd 100644 --- a/drivers/net/wireless/ti/wlcore/spi.c +++ b/drivers/net/wireless/ti/wlcore/spi.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/wl12xx.h b/include/linux/wl12xx.h deleted file mode 100644 index 03d61f1d23ab..000000000000 --- a/include/linux/wl12xx.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * This file is part of wl12xx - * - * Copyright (C) 2009 Nokia Corporation - * - * Contact: Luciano Coelho - */ - -#ifndef 
_LINUX_WL12XX_H -#define _LINUX_WL12XX_H - -#include - -struct wl1251_platform_data { - int power_gpio; - /* SDIO only: IRQ number if WLAN_IRQ line is used, 0 for SDIO IRQs */ - int irq; - bool use_eeprom; -}; - -#ifdef CONFIG_WILINK_PLATFORM_DATA - -int wl1251_set_platform_data(const struct wl1251_platform_data *data); - -struct wl1251_platform_data *wl1251_get_platform_data(void); - -#else - -static inline -int wl1251_set_platform_data(const struct wl1251_platform_data *data) -{ - return -ENOSYS; -} - -static inline -struct wl1251_platform_data *wl1251_get_platform_data(void) -{ - return ERR_PTR(-ENODATA); -} - -#endif - -#endif -- cgit v1.2.3 From 9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:57 -0800 Subject: udp: Introduce optional per-netns hash table. The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP, and fewer sockets can cause a performance drop. On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes regression for the iperf3's connection. uhash_entries sockets length Gbps 64K 1 1 5.69 1Mi 16 5.27 2Mi 32 4.90 4Mi 64 4.09 8Mi 128 2.96 16Mi 256 2.06 32Mi 512 1.12 The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns and we cannot make full use of the whole buckets larger than 64K. /* 0 < num < 64K -> X < hash < X + 64K */ (num + net_hash_mix(net)) & mask; Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port(). To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same with TCP: $ dmesg | cut -d ' ' -f 6- | grep "UDP hash" UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc) # sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 65536 # can be changed by uhash_entries # sysctl net.ipv4.udp_child_hash_entries net.ipv4.udp_child_hash_entries = 0 # disabled by default # ip netns add test1 # ip netns exec test1 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = -65536 # share the global table # sysctl -w net.ipv4.udp_child_hash_entries=100 net.ipv4.udp_child_hash_entries = 100 # ip netns add test2 # ip netns exec test2 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 128 # own a per-netns table with 2^n buckets We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.rst | 27 +++++++++ include/linux/udp.h | 2 + include/net/netns/ipv4.h | 2 + net/ipv4/sysctl_net_ipv4.c | 40 +++++++++++++ net/ipv4/udp.c | 101 +++++++++++++++++++++++++++++++-- 5 files changed, 166 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 815efc89ad73..727b25cc7ec4 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER udp_wmem_min - INTEGER UDP does not have tx memory accounting and this tunable has no effect. +udp_hash_entries - INTEGER + Show the number of hash buckets for UDP sockets in the current + networking namespace. + + A negative value means the networking namespace does not own its + hash buckets and shares the initial networking namespace's one. + +udp_child_hash_entries - INTEGER + Control the number of hash buckets for UDP sockets in the child + networking namespace, which must be set before clone() or unshare(). + + If the value is not 0, the kernel uses a value rounded up to 2^n + as the actual hash bucket size. 0 is a special value, meaning + the child networking namespace will share the initial networking + namespace's hash buckets. + + Note that the child will use the global one in case the kernel + fails to allocate enough memory. In addition, the global hash + buckets are spread over available NUMA nodes, but the allocation + of the child hash table depends on the current process's NUMA + policy, which could result in performance differences. + + Possible values: 0, 2^n (n: 7 (128) - 16 (64K)) + + Default: 0 + + RAW variables ============= diff --git a/include/linux/udp.h b/include/linux/udp.h index dea57aa37df6..a2892e151644 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } +#define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ?
128 : 256) +#define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e4cc4d3cacc4..db762e35aca9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -208,6 +208,8 @@ struct netns_ipv4 { atomic_t dev_addr_genid; + unsigned int sysctl_udp_child_hash_entries; + #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; int sysctl_ip_prot_sock; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0af28cedd071..0d0cc4ef2b85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; @@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write, if (!net_eq(net, &init_net) && !hinfo->pernet) tcp_ehash_entries *= -1; + memset(&tbl, 0, sizeof(tbl)); tbl.data = &tcp_ehash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } +static int proc_udp_hash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_udp_child_hash_entries); + int udp_hash_entries; + struct ctl_table tbl; + + udp_hash_entries = net->ipv4.udp_table->mask + 1; + + /* A negative number indicates that the child netns + * shares the global udp_table. + */ + if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) + udp_hash_entries *= -1; + + memset(&tbl, 0, sizeof(tbl)); + tbl.data = &udp_hash_entries; + tbl.maxlen = sizeof(int); + + return proc_dointvec(&tbl, write, buffer, lenp, ppos); +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, @@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_child_ehash_entries_max, }, + { + .procname = "udp_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .mode = 0444, + .proc_handler = proc_udp_hash_entries, + }, + { + .procname = "udp_child_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_child_hash_entries_max, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 37e79158d145..1fb7d1ed1cb1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -129,7 +129,7 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 -#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) static struct udp_table *udp_get_table_prot(struct sock *sk) { @@ -3277,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name) &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, - 64 * 1024); + UDP_HTABLE_SIZE_MAX); table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { @@ -3302,22 +3302,111 @@ u32 
udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); -static int __net_init udp_sysctl_init(struct net *net) +static void __net_init udp_sysctl_init(struct net *net) { - net->ipv4.udp_table = &udp_table; - net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif +} + +static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) +{ + struct udp_table *udptable; + int i; + + udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); + if (!udptable) + goto out; + + udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), + GFP_KERNEL_ACCOUNT); + if (!udptable->hash) + goto free_table; + + udptable->hash2 = udptable->hash + hash_entries; + udptable->mask = hash_entries - 1; + udptable->log = ilog2(hash_entries); + + for (i = 0; i < hash_entries; i++) { + INIT_HLIST_HEAD(&udptable->hash[i].head); + udptable->hash[i].count = 0; + spin_lock_init(&udptable->hash[i].lock); + + INIT_HLIST_HEAD(&udptable->hash2[i].head); + udptable->hash2[i].count = 0; + spin_lock_init(&udptable->hash2[i].lock); + } + + return udptable; + +free_table: + kfree(udptable); +out: + return NULL; +} + +static void __net_exit udp_pernet_table_free(struct net *net) +{ + struct udp_table *udptable = net->ipv4.udp_table; + + if (udptable == &udp_table) + return; + + kvfree(udptable->hash); + kfree(udptable); +} + +static void __net_init udp_set_table(struct net *net) +{ + struct udp_table *udptable; + unsigned int hash_entries; + struct net *old_net; + + if (net_eq(net, &init_net)) + goto fallback; + + old_net = current->nsproxy->net_ns; + hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); + if (!hash_entries) + goto fallback; + + /* Set min to keep the bitmap on stack in udp_lib_get_port() */ + if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) + hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; + else + hash_entries = roundup_pow_of_two(hash_entries); + + udptable = udp_pernet_table_alloc(hash_entries); + if (udptable) { + net->ipv4.udp_table = udptable; + } else { + pr_warn("Failed to allocate UDP hash table (entries: %u) " + "for a netns, fallback to the global one\n", + hash_entries); +fallback: + net->ipv4.udp_table = &udp_table; + } +} + +static int __net_init udp_pernet_init(struct net *net) +{ + udp_sysctl_init(net); + udp_set_table(net); return 0; } +static void __net_exit udp_pernet_exit(struct net *net) +{ + udp_pernet_table_free(net); +} + static struct pernet_operations __net_initdata udp_sysctl_ops = { - .init = udp_sysctl_init, + .init = udp_pernet_init, + .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -- cgit v1.2.3 From 6c1c5097781f563b70a81683ea6fdac21637573b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:55 +0000 Subject: net: add atomic_long_t to net_device_stats fields Long standing KCSAN issues are caused by data-race around some dev->stats changes. Most performance critical paths already use per-cpu variables, or per-queue ones. It is reasonable (and more correct) to use atomic operations for the slow paths. This patch adds an union for each field of net_device_stats, so that we can convert paths that are not yet protected by a spinlock or a mutex. netdev_stats_to_stats64() no longer has an #if BITS_PER_LONG==64 Note that the memcpy() we were using on 64bit arches had no provision to avoid load-tearing, while atomic_long_read() is providing the needed protection at no cost. 
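As an illustration of the layout trick, here is a minimal user-space sketch of the same union pattern, with C11 atomics standing in for the kernel's atomic_long_t (the struct and field names are illustrative, not the kernel's):

	#include <stdatomic.h>
	#include <stdio.h>

	struct demo_stats {
		union {
			unsigned long rx_dropped;	/* legacy plain view */
			atomic_ulong __rx_dropped;	/* tear-free accessor */
		};
	};

	int main(void)
	{
		struct demo_stats s;

		atomic_init(&s.__rx_dropped, 0);

		/* slow path: atomic RMW instead of a racy s.rx_dropped++ */
		atomic_fetch_add_explicit(&s.__rx_dropped, 1, memory_order_relaxed);

		/* dump path: atomic load avoids load-tearing */
		printf("rx_dropped=%lu\n",
		       atomic_load_explicit(&s.__rx_dropped, memory_order_relaxed));
		return 0;
	}

Because both union members have the same size and alignment, code that only ever reads the plain counters keeps working unmodified.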
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 58 ++++++++++++++++++++++++++++------------------- include/net/dst.h | 5 ++-- net/core/dev.c | 14 +++--------- 3 files changed, 40 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..23b3903b0678 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -171,31 +171,38 @@ static inline bool dev_xmit_complete(int rc) * (unsigned long) so they can be read and written atomically. */ +#define NET_DEV_STAT(FIELD) \ + union { \ + unsigned long FIELD; \ + atomic_long_t __##FIELD; \ + } + struct net_device_stats { - unsigned long rx_packets; - unsigned long tx_packets; - unsigned long rx_bytes; - unsigned long tx_bytes; - unsigned long rx_errors; - unsigned long tx_errors; - unsigned long rx_dropped; - unsigned long tx_dropped; - unsigned long multicast; - unsigned long collisions; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_frame_errors; - unsigned long rx_fifo_errors; - unsigned long rx_missed_errors; - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEV_STAT(rx_packets); + NET_DEV_STAT(tx_packets); + NET_DEV_STAT(rx_bytes); + NET_DEV_STAT(tx_bytes); + NET_DEV_STAT(rx_errors); + NET_DEV_STAT(tx_errors); + NET_DEV_STAT(rx_dropped); + NET_DEV_STAT(tx_dropped); + NET_DEV_STAT(multicast); + NET_DEV_STAT(collisions); + NET_DEV_STAT(rx_length_errors); + NET_DEV_STAT(rx_over_errors); + NET_DEV_STAT(rx_crc_errors); + NET_DEV_STAT(rx_frame_errors); + NET_DEV_STAT(rx_fifo_errors); + NET_DEV_STAT(rx_missed_errors); + NET_DEV_STAT(tx_aborted_errors); + NET_DEV_STAT(tx_carrier_errors); + NET_DEV_STAT(tx_fifo_errors); + NET_DEV_STAT(tx_heartbeat_errors); + NET_DEV_STAT(tx_window_errors); + NET_DEV_STAT(rx_compressed); + NET_DEV_STAT(tx_compressed); }; +#undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. @@ -5171,4 +5178,9 @@ extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; +/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. 
*/ +#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) +#define DEV_STATS_ADD(DEV, FIELD, VAL) \ + atomic_long_add((VAL), &(DEV)->stats.__##FIELD) + #endif /* _LINUX_NETDEVICE_H */ diff --git a/include/net/dst.h b/include/net/dst.h index 00b479ce6b99..d67fda89cd0f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -356,9 +356,8 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { - /* TODO : stats should be SMP safe */ - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + DEV_STATS_INC(dev, rx_packets); + DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } diff --git a/net/core/dev.c b/net/core/dev.c index fb943dad9651..d0fb4af9a126 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10369,24 +10369,16 @@ void netdev_run_todo(void) void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); - /* zero out counters that only exist in rtnl_link_stats64 */ - memset((char *)stats64 + sizeof(*netdev_stats), 0, - sizeof(*stats64) - sizeof(*netdev_stats)); -#else - size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); - const unsigned long *src = (const unsigned long *)netdev_stats; + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = src[i]; + dst[i] = atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); -#endif } EXPORT_SYMBOL(netdev_stats_to_stats64); -- cgit v1.2.3 From 3af43ba4c6019b29c048921eb8147eb010165329 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 16 Nov 2022 15:50:58 +0800 Subject: bpf: Pass map file to .map_update_batch directly Currently bpf_map_do_batch() first invokes fdget(batch.map_fd) to get the target map file, then it invokes generic_map_update_batch() to do batch update. generic_map_update_batch() will get the target map file by using fdget(batch.map_fd) again and pass it to bpf_map_update_value(). The problem is map file returned by the second fdget() may be NULL or a totally different file compared by map file in bpf_map_do_batch(). The reason is that the first fdget() only guarantees the liveness of struct file instead of file descriptor and the file description may be released by concurrent close() through pick_file(). It doesn't incur any problem as for now, because maps with batch update support don't use map file in .map_fd_get_ptr() ops. But it is better to fix the potential access of an invalid map file. Using __bpf_map_get() again in generic_map_update_batch() can not fix the problem, because batch.map_fd may be closed and reopened, and the returned map file may be different with map file got in bpf_map_do_batch(), so just passing the map file directly to .map_update_batch() in bpf_map_do_batch(). 
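The descriptor-number reuse that makes a second fdget() unsafe is easy to reproduce from user space. A small sketch, by analogy only: dup() plays the role of the first fdget() pinning the struct file, and the reopen plays the concurrent close()/open() race:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/null", O_RDONLY);
		int pinned = dup(fd);	/* keeps the old struct file alive */
		int reused;

		close(fd);		/* the descriptor number is now free... */
		reused = open("/dev/zero", O_RDONLY);

		/* ...and is typically handed out again for a different file */
		printf("fd %d reused as fd %d\n", fd, reused);

		close(pinned);
		close(reused);
		return 0;
	}

A second lookup by number can thus resolve to an unrelated file (or none at all), which is why the batch path must keep using the struct file obtained by the first lookup.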
Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221116075059.1551277-1-houtao@huaweicloud.com --- include/linux/bpf.h | 5 +++-- kernel/bpf/syscall.c | 32 ++++++++++++++------------------ 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54462dd28824..e60a5c052473 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,7 +85,8 @@ struct bpf_map_ops { int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); - int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, + int (*map_update_batch)(struct bpf_map *map, struct file *map_file, + const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -1789,7 +1790,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fdbae52f463f..b078965999e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -175,8 +175,8 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } -static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, - void *value, __u64 flags) +static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, + void *key, void *value, __u64 flags) { int err; @@ -190,7 +190,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { - return bpf_fd_array_map_update_elem(map, f.file, key, value, + return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } @@ -205,12 +205,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); - err = bpf_fd_array_map_update_elem(map, f.file, key, value, + err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { rcu_read_lock(); - err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { @@ -1410,7 +1410,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto free_key; } - err = bpf_map_update_value(map, f, key, value, attr->flags); + err = bpf_map_update_value(map, f.file, key, value, attr->flags); kvfree(value); free_key: @@ -1596,16 +1596,14 @@ int generic_map_delete_batch(struct bpf_map *map, return err; } -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; - int ufd = attr->batch.map_fd; void *key, 
*value; - struct fd f; int err = 0; if (attr->batch.elem_flags & ~BPF_F_LOCK) @@ -1632,7 +1630,6 @@ int generic_map_update_batch(struct bpf_map *map, return -ENOMEM; } - f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, @@ -1640,7 +1637,7 @@ int generic_map_update_batch(struct bpf_map *map, copy_from_user(value, values + cp * value_size, value_size)) break; - err = bpf_map_update_value(map, f, key, value, + err = bpf_map_update_value(map, map_file, key, value, attr->batch.elem_flags); if (err) @@ -1653,7 +1650,6 @@ int generic_map_update_batch(struct bpf_map *map, kvfree(value); kvfree(key); - fdput(f); return err; } @@ -4446,13 +4442,13 @@ put_file: #define BPF_MAP_BATCH_LAST_FIELD batch.flags -#define BPF_DO_BATCH(fn) \ +#define BPF_DO_BATCH(fn, ...) \ do { \ if (!fn) { \ err = -ENOTSUPP; \ goto err_put; \ } \ - err = fn(map, attr, uattr); \ + err = fn(__VA_ARGS__); \ } while (0) static int bpf_map_do_batch(const union bpf_attr *attr, @@ -4486,13 +4482,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, } if (cmd == BPF_MAP_LOOKUP_BATCH) - BPF_DO_BATCH(map->ops->map_lookup_batch); + BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) - BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) - BPF_DO_BATCH(map->ops->map_update_batch); + BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr); else - BPF_DO_BATCH(map->ops->map_delete_batch); + BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: if (has_write) bpf_map_write_active_dec(map); -- cgit v1.2.3 From 282de143ead96a5d53331e946f31c977b4610a74 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:55 +0530 Subject: bpf: Introduce allocated objects support Introduce support for representing pointers to objects allocated by the BPF program, i.e. PTR_TO_BTF_ID that point to a type in program BTF. This is indicated by the presence of MEM_ALLOC type flag in reg->type to avoid having to check btf_is_kernel when trying to match argument types in helpers. Whenever walking such types, any pointers being walked will always yield a SCALAR instead of a pointer. In the future we might permit kptr inside such allocated objects (either kernel or program allocated), and it will then form a PTR_TO_BTF_ID of the respective type. For now, such allocated objects will always be referenced in verifier context, hence ref_obj_id == 0 for them is a bug. It is allowed to write to such objects, as long as fields that are special are not touched (support for which will be added in subsequent patches). Note that once such a pointer is marked PTR_UNTRUSTED, it is no longer allowed to write to it. No PROBE_MEM handling is therefore done for loads into this type unless PTR_UNTRUSTED is part of the register type, since they can never be in an undefined state, and their lifetime will always be valid.
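From the program side, such objects come from the bpf_obj_new() kfunc added later in this series. A minimal sketch of the expected usage, with the kfunc declarations spelled out by hand in the style of the selftests' bpf_experimental.h (the declarations are assumptions for illustration, not part of this patch):

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_core_read.h>

	struct node_data {
		long key;
		long data;
	};

	extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
	extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;

	#define bpf_obj_new(type) \
		((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
	#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)

	SEC("tc")
	int alloc_example(struct __sk_buff *ctx)
	{
		struct node_data *n;

		n = bpf_obj_new(typeof(*n));	/* PTR_TO_BTF_ID | MEM_ALLOC */
		if (!n)				/* referenced: NULL check ... */
			return 0;
		n->key = 1;			/* plain writes are allowed */
		bpf_obj_drop(n);		/* ... and release are required */
		return 0;
	}

	char _license[] SEC("license") = "GPL";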
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ kernel/bpf/btf.c | 5 +++++ kernel/bpf/verifier.c | 25 +++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e60a5c052473..7440c20c4192 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -525,6 +525,11 @@ enum bpf_type_flag { /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), + /* MEM is of an allocated object of type in program BTF. This is used to + * tag PTR_TO_BTF_ID allocated using bpf_obj_new. + */ + MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -2792,4 +2797,10 @@ struct bpf_key { bool has_ref; }; #endif /* CONFIG_KEYS */ + +static inline bool type_is_alloc(u32 type) +{ + return type & MEM_ALLOC; +} + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 875355ff3718..9a596f430558 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6034,6 +6034,11 @@ int btf_struct_access(struct bpf_verifier_log *log, switch (err) { case WALK_PTR: + /* For local types, the destination register cannot + * become a pointer again. + */ + if (type_is_alloc(reg->type)) + return SCALAR_VALUE; /* If we found the pointer or scalar on t+off, * we're done. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0312d9ce292f..49e08c1c2c61 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4687,14 +4687,27 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (env->ops->btf_struct_access) { + if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { + if (!btf_is_kernel(reg->btf)) { + verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + return -EFAULT; + } ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } else { - if (atype != BPF_READ) { + /* Writes are permitted with default btf_struct_access for + * program allocated objects (which always have ref_obj_id > 0), + * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. + */ + if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "only read is supported\n"); return -EACCES; } + if (type_is_alloc(reg->type) && !reg->ref_obj_id) { + verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + return -EFAULT; + } + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } @@ -5973,6 +5986,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, * fixed offset. */ case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | MEM_ALLOC: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. In the other cases, fixed offset * can be non-zero. @@ -13690,6 +13704,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) break; case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: + /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike + * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * be said once it is marked PTR_UNTRUSTED, hence we must handle + * any faults for loads into such types. BPF_WRITE is disallowed + * for this case. 
+ */ + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); -- cgit v1.2.3 From 8ffa5cc142137a59d6a10eb5273fa2ba5dcd4947 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:56 +0530 Subject: bpf: Recognize lock and list fields in allocated objects Allow specifying bpf_spin_lock, bpf_list_head, bpf_list_node fields in an allocated object. Also update btf_struct_access to reject direct access to these special fields. A bpf_list_head allows implementing map-in-map style use cases, where an allocated object with bpf_list_head is linked into a list in a map value. This would require embedding a bpf_list_node, support for which is also included. The bpf_spin_lock is used to protect the bpf_list_head and other data. While we strictly don't require holding a bpf_spin_lock while touching the bpf_list_head in such objects, as when we have access to it, we have complete ownership of the object, the locking constraint is still kept and may be conditionally lifted in the future. Note that the specification of such types can be done just like map values, e.g.: struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); struct bpf_list_node node; }; struct map_value { struct bpf_spin_lock lock; struct bpf_list_head head __contains(foo, node); }; To recognize such types in user BTF, we build a btf_struct_metas array of metadata items corresponding to each BTF ID. This is done once during the btf_parse stage to avoid having to recompute it each time the verification process needs to inspect the metadata. Moreover, the computed metadata needs to be passed to some helpers in future patches, which requires allocating and storing it in the BTF that is pinned by the program itself, so that valid access can be assumed for such data during program runtime. A key thing to note is that once a btf_struct_meta is available for a type, both the btf_record and btf_field_offs should be available. It is critical that btf_field_offs is available in case special fields are present, as we extensively rely on special fields being zeroed out in map values and allocated objects in later patches. The code ensures that by bailing out in case of errors and ensuring both are available together. If the record is not available, the special fields won't be recognized, so not having both is also fine (in terms of being a verification error and not a runtime bug).
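The __contains() annotation used above is not new C syntax; it is expected to be a thin macro that records the value type and its node member as a BTF declaration tag on the field, which btf_parse_fields() resolves into the value_btf_id and node_offset of the btf_field. A sketch of the definition as the selftests' helper header is expected to provide it (an assumption, not part of this diff):

	/* "this bpf_list_head contains struct <name>, linked via member <node>" */
	#define __contains(name, node) \
		__attribute__((btf_decl_tag("contains:" #name ":" #node)))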
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/btf.h | 35 +++++++++ kernel/bpf/btf.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 4 ++ 4 files changed, 225 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7440c20c4192..eb6ea53fa5a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -177,6 +177,7 @@ enum btf_field_type { BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), + BPF_LIST_NODE = (1 << 5), }; struct btf_field_kptr { @@ -277,6 +278,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; + case BPF_LIST_NODE: + return "bpf_list_node"; default: WARN_ON_ONCE(1); return "unknown"; @@ -295,6 +298,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); + case BPF_LIST_NODE: + return sizeof(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; @@ -313,6 +318,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); + case BPF_LIST_NODE: + return __alignof__(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; diff --git a/include/linux/btf.h b/include/linux/btf.h index d80345fa566b..a01a8da20021 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -78,6 +80,17 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +struct btf_struct_meta { + u32 btf_id; + struct btf_record *record; + struct btf_field_offs *field_offs; +}; + +struct btf_struct_metas { + u32 cnt; + struct btf_struct_meta types[]; +}; + typedef void (*btf_dtor_kfunc_t)(void *); extern const struct file_operations btf_fops; @@ -408,6 +421,23 @@ static inline struct btf_param *btf_params(const struct btf_type *t) return (struct btf_param *)(t + 1); } +static inline int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} + +static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; @@ -423,6 +453,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -454,6 +485,10 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt { return 0; } +static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + return NULL; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9a596f430558..a04e10477567 100644 --- 
a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -237,6 +237,7 @@ struct btf { struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; + struct btf_struct_metas *struct_meta_tab; /* split BTF support */ struct btf *base_btf; @@ -1642,8 +1643,30 @@ static void btf_free_dtor_kfunc_tab(struct btf *btf) btf->dtor_kfunc_tab = NULL; } +static void btf_struct_metas_free(struct btf_struct_metas *tab) +{ + int i; + + if (!tab) + return; + for (i = 0; i < tab->cnt; i++) { + btf_record_free(tab->types[i].record); + kfree(tab->types[i].field_offs); + } + kfree(tab); +} + +static void btf_free_struct_meta_tab(struct btf *btf) +{ + struct btf_struct_metas *tab = btf->struct_meta_tab; + + btf_struct_metas_free(tab); + btf->struct_meta_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); kvfree(btf->types); @@ -3353,6 +3376,12 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_LIST_NODE) { + if (!strcmp(name, "bpf_list_node")) { + type = BPF_LIST_NODE; + goto end; + } + } /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & BPF_KPTR) { type = BPF_KPTR_REF; @@ -3396,6 +3425,7 @@ static int btf_find_struct_field(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_LIST_NODE: ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3456,6 +3486,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_LIST_NODE: ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3671,6 +3702,8 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type if (ret < 0) goto end; break; + case BPF_LIST_NODE: + break; default: ret = -EFAULT; goto end; @@ -5141,6 +5174,119 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return btf_check_sec_info(env, btf_data_size); } +static const char *alloc_obj_fields[] = { + "bpf_spin_lock", + "bpf_list_head", + "bpf_list_node", +}; + +static struct btf_struct_metas * +btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) +{ + union { + struct btf_id_set set; + struct { + u32 _cnt; + u32 _ids[ARRAY_SIZE(alloc_obj_fields)]; + } _arr; + } aof; + struct btf_struct_metas *tab = NULL; + int i, n, id, ret; + + BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); + BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); + + memset(&aof, 0, sizeof(aof)); + for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { + /* Try to find whether this special type exists in user BTF, and + * if so remember its ID so we can easily find it among members + * of structs that we iterate in the next loop. 
+ */ + id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); + if (id < 0) + continue; + aof.set.ids[aof.set.cnt++] = id; + } + + if (!aof.set.cnt) + return NULL; + sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL); + + n = btf_nr_types(btf); + for (i = 1; i < n; i++) { + struct btf_struct_metas *new_tab; + const struct btf_member *member; + struct btf_field_offs *foffs; + struct btf_struct_meta *type; + struct btf_record *record; + const struct btf_type *t; + int j, tab_cnt; + + t = btf_type_by_id(btf, i); + if (!t) { + ret = -EINVAL; + goto free; + } + if (!__btf_type_is_struct(t)) + continue; + + cond_resched(); + + for_each_member(j, t, member) { + if (btf_id_set_contains(&aof.set, member->type)) + goto parse; + } + continue; + parse: + tab_cnt = tab ? tab->cnt : 0; + new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) { + ret = -ENOMEM; + goto free; + } + if (!tab) + new_tab->cnt = 0; + tab = new_tab; + + type = &tab->types[tab->cnt]; + type->btf_id = i; + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size); + /* The record cannot be unset, treat it as an error if so */ + if (IS_ERR_OR_NULL(record)) { + ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; + goto free; + } + foffs = btf_parse_field_offs(record); + /* We need the field_offs to be valid for a valid record, + * either both should be set or both should be unset. + */ + if (IS_ERR_OR_NULL(foffs)) { + btf_record_free(record); + ret = -EFAULT; + goto free; + } + type->record = record; + type->field_offs = foffs; + tab->cnt++; + } + return tab; +free: + btf_struct_metas_free(tab); + return ERR_PTR(ret); +} + +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + struct btf_struct_metas *tab; + + BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0); + tab = btf->struct_meta_tab; + if (!tab) + return NULL; + return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func); +} + static int btf_check_type_tags(struct btf_verifier_env *env, struct btf *btf, int start_id) { @@ -5191,6 +5337,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env, static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { + struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; @@ -5259,15 +5406,24 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + struct_meta_tab = btf_parse_struct_metas(log, btf); + if (IS_ERR(struct_meta_tab)) { + err = PTR_ERR(struct_meta_tab); + goto errout; + } + btf->struct_meta_tab = struct_meta_tab; + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; - goto errout; + goto errout_meta; } btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; +errout_meta: + btf_free_struct_meta_tab(btf); errout: btf_verifier_env_free(env); if (btf) @@ -6028,6 +6184,28 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 id = reg->btf_id; int err; + while (type_is_alloc(reg->type)) { + struct btf_struct_meta *meta; + struct btf_record *rec; + int i; + + meta = btf_find_struct_meta(btf, id); + if (!meta) + break; + rec = meta->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 offset = field->offset; + if (off < offset + btf_field_type_size(field->type) && offset < off + size) { 
+ bpf_log(log, + "direct access to %s is disallowed\n", + btf_field_type_name(field->type)); + return -EACCES; + } + } + break; + } + t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); @@ -7269,23 +7447,6 @@ bool btf_is_module(const struct btf *btf) return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; } -static int btf_id_cmp_func(const void *a, const void *b) -{ - const int *pa = a, *pb = b; - - return *pa - *pb; -} - -bool btf_id_set_contains(const struct btf_id_set *set, u32 id) -{ - return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; -} - -static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) -{ - return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); -} - enum { BTF_MODULE_F_LIVE = (1 << 0), }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4c20dcbc6526..56ae97d490f4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -537,6 +537,7 @@ void btf_record_free(struct btf_record *rec) btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: + case BPF_LIST_NODE: /* Nothing to release for bpf_list_head */ break; default: @@ -582,6 +583,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) } break; case BPF_LIST_HEAD: + case BPF_LIST_NODE: /* Nothing to acquire for bpf_list_head */ break; default: @@ -648,6 +650,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; + case BPF_LIST_NODE: + break; default: WARN_ON_ONCE(1); continue; -- cgit v1.2.3 From 865ce09a49d79d2b2c1d980f4c05ffc0b3517bdc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:57 +0530 Subject: bpf: Verify ownership relationships for user BTF types Ensure that there can be no ownership cycles among different types by way of having owning objects that can hold some other type as their element. For instance, a map value can only hold allocated objects, but these are allowed to have another bpf_list_head. To prevent unbounded recursion while freeing resources, elements of bpf_list_head in local kptrs can never have a bpf_list_head which are part of list in a map value. Later patches will verify this by having dedicated BTF selftests. Also, to make runtime destruction easier, once btf_struct_metas is fully populated, we can stash the metadata of the value type directly in the metadata of the list_head fields, as that allows easier access to the value type's layout to destruct it at runtime from the btf_field entry of the list head itself. 
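As a concrete illustration, a hypothetical pair of types shaped like the following would be rejected with -ELOOP, since each one both owns a list and is itself linkable, so following the ownership edges could recurse without bound when freeing:

	struct a {
		struct bpf_spin_lock lock;
		struct bpf_list_head head __contains(b, node);	/* A owns B... */
		struct bpf_list_node node;			/* ...and is ownable */
	};

	struct b {
		struct bpf_spin_lock lock;
		struct bpf_list_head head __contains(a, node);	/* B owns A: cycle */
		struct bpf_list_node node;
	};

A type may carry both a bpf_list_head and a bpf_list_node, but then the element type reachable through its bpf_list_head must be a pure leaf that only embeds a bpf_list_node.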
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + kernel/bpf/btf.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 4 +++ 4 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb6ea53fa5a2..323985a39ece 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -191,6 +191,7 @@ struct btf_field_list_head { struct btf *btf; u32 value_btf_id; u32 node_offset; + struct btf_record *value_rec; }; struct btf_field { diff --git a/include/linux/btf.h b/include/linux/btf.h index a01a8da20021..42d8f3730a8d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -178,6 +178,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a04e10477567..91aa9c96621f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3723,6 +3723,67 @@ end: return ERR_PTR(ret); } +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) +{ + int i; + + /* There are two owning types, kptr_ref and bpf_list_head. The former + * only supports storing kernel types, which can never store references + * to program allocated local types, atleast not yet. Hence we only need + * to ensure that bpf_list_head ownership does not form cycles. + */ + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD)) + return 0; + for (i = 0; i < rec->cnt; i++) { + struct btf_struct_meta *meta; + u32 btf_id; + + if (!(rec->fields[i].type & BPF_LIST_HEAD)) + continue; + btf_id = rec->fields[i].list_head.value_btf_id; + meta = btf_find_struct_meta(btf, btf_id); + if (!meta) + return -EFAULT; + rec->fields[i].list_head.value_rec = meta->record; + + if (!(rec->field_mask & BPF_LIST_NODE)) + continue; + + /* We need to ensure ownership acyclicity among all types. The + * proper way to do it would be to topologically sort all BTF + * IDs based on the ownership edges, since there can be multiple + * bpf_list_head in a type. Instead, we use the following + * reasoning: + * + * - A type can only be owned by another type in user BTF if it + * has a bpf_list_node. + * - A type can only _own_ another type in user BTF if it has a + * bpf_list_head. + * + * We ensure that if a type has both bpf_list_head and + * bpf_list_node, its element types cannot be owning types. + * + * To ensure acyclicity: + * + * When A only has bpf_list_head, ownership chain can be: + * A -> B -> C + * Where: + * - B has both bpf_list_head and bpf_list_node. + * - C only has bpf_list_node. + * + * When A has both bpf_list_head and bpf_list_node, some other + * type already owns it in the BTF domain, hence it can not own + * another owning type through any of the bpf_list_head edges. + * A -> B + * Where: + * - B only has bpf_list_node. 
+ */ + if (meta->record->field_mask & BPF_LIST_HEAD) + return -ELOOP; + } + return 0; +} + static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) { const u32 a = *(const u32 *)_a; @@ -5413,6 +5474,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } btf->struct_meta_tab = struct_meta_tab; + if (struct_meta_tab) { + int i; + + for (i = 0; i < struct_meta_tab->cnt; i++) { + err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record); + if (err < 0) + goto errout_meta; + } + } + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout_meta; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56ae97d490f4..6140cbc3ed8a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1054,6 +1054,10 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } } + ret = btf_check_and_fixup_fields(btf, map->record); + if (ret < 0) + goto free_map_tab; + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); if (ret < 0) -- cgit v1.2.3 From d0d78c1df9b1dbfb5e172de473561ce09d5e9d39 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:59 +0530 Subject: bpf: Allow locking bpf_spin_lock global variables Global variables reside in maps accessible using direct_value_addr callbacks, so giving each load instruction's rewrite a unique reg->id disallows us from holding locks which are global. The reason for preserving reg->id as a unique value for registers that may point to spin lock is that two separate lookups are treated as two separate memory regions, and any possible aliasing is ignored for the purposes of spin lock correctness. This is not great especially for the global variable case, which are served from maps that have max_entries == 1, i.e. they always lead to map values pointing into the same map value. So refactor the active_spin_lock into a 'active_lock' structure which represents the lock identity, and instead of the reg->id, remember two fields, a pointer and the reg->id. The pointer will store reg->map_ptr or reg->btf. It's only necessary to distinguish for the id == 0 case of global variables, but always setting the pointer to a non-NULL value and using the pointer to check whether the lock is held simplifies code in the verifier. This is generic enough to allow it for global variables, map lookups, and allocated objects at the same time. Note that while whether a lock is held can be answered by just comparing active_lock.ptr to NULL, to determine whether the register is pointing to the same held lock requires comparing _both_ ptr and id. Finally, as a result of this refactoring, pseudo load instructions are not given a unique reg->id, as they are doing lookup for the same map value (max_entries is never greater than 1). Essentially, we consider that the tuple of (ptr, id) will always be unique for any kind of argument to bpf_spin_{lock,unlock}. Note that this can be extended in the future to also remember offset used for locking, so that we can introduce multiple bpf_spin_lock fields in the same allocation. 
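In BPF program terms, this makes the following previously rejected pattern verifiable; a minimal sketch, assuming libbpf's usual mapping of global variables to a single-entry .bss/.data array map:

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>

	/* Globals live in one map value (max_entries == 1), so every load
	 * of &glock yields the same (map_ptr, id) lock identity.
	 */
	struct bpf_spin_lock glock;
	int counter;

	SEC("tc")
	int global_lock_example(struct __sk_buff *ctx)
	{
		bpf_spin_lock(&glock);
		counter++;
		bpf_spin_unlock(&glock);
		return 0;
	}

	char _license[] SEC("license") = "GPL";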
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-10-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 16 +++++++++++++++- kernel/bpf/verifier.c | 41 ++++++++++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..1db2b4dc7009 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -323,7 +323,21 @@ struct bpf_verifier_state { u32 branches; u32 insn_idx; u32 curframe; - u32 active_spin_lock; + /* For every reg representing a map value or allocated object pointer, + * we consider the tuple of (ptr, id) for them to be unique in verifier + * context and conside them to not alias each other for the purposes of + * tracking lock state. + */ + struct { + /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, + * there's no active lock held, and other fields have no + * meaning. If non-NULL, it indicates that a lock is held and + * id member has the reg->id of the register which can be >= 0. + */ + void *ptr; + /* This will be reg->id */ + u32 id; + } active_lock; bool speculative; /* first and last insn idx of this verifier state */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 19467dda5dd9..c8f3abe9b08e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1221,7 +1221,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; - dst_state->active_spin_lock = src->active_spin_lock; + dst_state->active_lock.ptr = src->active_lock.ptr; + dst_state->active_lock.id = src->active_lock.id; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; @@ -5596,7 +5597,7 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. - * cur_state->active_spin_lock remembers which map value element or allocated + * cur_state->active_lock remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. 
*/ static int process_spin_lock(struct bpf_verifier_env *env, int regno, @@ -5640,22 +5641,35 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (is_lock) { - if (cur->active_spin_lock) { + if (cur->active_lock.ptr) { verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL; } - cur->active_spin_lock = reg->id; + if (map) + cur->active_lock.ptr = map; + else + cur->active_lock.ptr = btf; + cur->active_lock.id = reg->id; } else { - if (!cur->active_spin_lock) { + void *ptr; + + if (map) + ptr = map; + else + ptr = btf; + + if (!cur->active_lock.ptr) { verbose(env, "bpf_spin_unlock without taking a lock\n"); return -EINVAL; } - if (cur->active_spin_lock != reg->id) { + if (cur->active_lock.ptr != ptr || + cur->active_lock.id != reg->id) { verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } - cur->active_spin_lock = 0; + cur->active_lock.ptr = NULL; + cur->active_lock.id = 0; } return 0; } @@ -10617,8 +10631,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) - dst_reg->id = ++env->id_gen; + WARN_ON_ONCE(map->max_entries != 1); + /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; @@ -10696,7 +10710,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - if (env->cur_state->active_spin_lock) { + if (env->cur_state->active_lock.ptr) { verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); return -EINVAL; } @@ -11962,7 +11976,8 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; - if (old->active_spin_lock != cur->active_spin_lock) + if (old->active_lock.ptr != cur->active_lock.ptr || + old->active_lock.id != cur->active_lock.id) return false; /* for states to be equal callsites have to be the same @@ -12607,7 +12622,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_spin_lock && + if (env->cur_state->active_lock.ptr && (insn->src_reg == BPF_PSEUDO_CALL || insn->imm != BPF_FUNC_spin_unlock)) { verbose(env, "function calls are not allowed while holding a lock\n"); @@ -12644,7 +12659,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_spin_lock) { + if (env->cur_state->active_lock.ptr) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } -- cgit v1.2.3 From 00b85860feb809852af9a88cb4ca8766d7dff6a3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:01 +0530 Subject: bpf: Rewrite kfunc argument handling As we continue to add more features, argument types, kfunc flags, and different extensions to kfuncs, the code to verify the correctness of the kfunc prototype wrt the passed in registers has become ad-hoc and ugly to read. To make life easier, and make a very clear split between different stages of argument processing, move all the code into verifier.c and refactor into easier to read helpers and functions. This also makes sharing code within the verifier easier with kfunc argument processing. 
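As a rough sketch of the conventions this code checks, a kfunc taking a memory/size pair is recognized purely from its parameter names; the function below is hypothetical, only the "__sz" suffix handling (see is_kfunc_arg_mem_size()) is real, and registration via register_btf_kfunc_id_set() is omitted:

/* "data__sz" ends in "__sz", so the verifier treats (data, data__sz)
 * as a mem, len pair and range-checks the access instead of matching
 * the pointee's BTF type.
 */
noinline int bpf_example_process(void *data, u32 data__sz)
{
	return data__sz ? ((char *)data)[0] : 0;
}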
This will be more and more useful in later patches as we are now moving to implement very core BPF helpers as kfuncs, to keep them experimental before baking into UAPI. Remove all kfunc related bits now from btf_check_func_arg_match, as users have been converted away to refactored kfunc argument handling. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-12-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 - include/linux/bpf_verifier.h | 2 - include/linux/btf.h | 31 +- kernel/bpf/btf.c | 380 +------------- kernel/bpf/verifier.c | 545 ++++++++++++++++++++- .../selftests/bpf/prog_tests/kfunc_dynptr_param.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../testing/selftests/bpf/verifier/ref_tracking.c | 4 +- 8 files changed, 573 insertions(+), 404 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 323985a39ece..0a74df731eb8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2109,22 +2109,11 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); -struct bpf_kfunc_arg_meta { - u64 r0_size; - bool r0_rdonly; - int ref_obj_id; - u32 flags; -}; - struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1db2b4dc7009..fb146b0ce006 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -603,8 +603,6 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type); -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, diff --git a/include/linux/btf.h b/include/linux/btf.h index 42d8f3730a8d..d5b26380a60f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -338,6 +338,16 @@ static inline bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static inline bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + +static inline bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -439,9 +449,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } -#ifdef CONFIG_BPF_SYSCALL struct bpf_prog; +struct bpf_verifier_log; +#ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf 
*btf_parse_vmlinux(void); @@ -455,6 +466,12 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); +const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg); +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -490,6 +507,18 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf { return NULL; } +static inline const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + return NULL; +} +static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + return false; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 91aa9c96621f..4dcda4ae48c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -478,16 +478,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -static bool __btf_type_is_struct(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; -} - -static bool btf_type_is_array(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -5536,7 +5526,7 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -static const struct btf_member * +const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) @@ -6322,8 +6312,8 @@ int btf_struct_access(struct bpf_verifier_log *log, * end up with two different module BTFs, but IDs point to the common type in * vmlinux BTF. 
*/ -static bool btf_types_are_same(const struct btf *btf1, u32 id1, - const struct btf *btf2, u32 id2) +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) { if (id1 != id2) return false; @@ -6605,122 +6595,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { -#ifdef CONFIG_NET - [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], - [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], - [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], -#endif -}; - -/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ -static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int rec) -{ - const struct btf_type *member_type; - const struct btf_member *member; - u32 i; - - if (!btf_type_is_struct(t)) - return false; - - for_each_member(i, t, member) { - const struct btf_array *array; - - member_type = btf_type_skip_modifiers(btf, member->type, NULL); - if (btf_type_is_struct(member_type)) { - if (rec >= 3) { - bpf_log(log, "max struct nesting depth exceeded\n"); - return false; - } - if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1)) - return false; - continue; - } - if (btf_type_is_array(member_type)) { - array = btf_type_array(member_type); - if (!array->nelems) - return false; - member_type = btf_type_skip_modifiers(btf, array->type, NULL); - if (!btf_type_is_scalar(member_type)) - return false; - continue; - } - if (!btf_type_is_scalar(member_type)) - return false; - } - return true; -} - -static bool is_kfunc_arg_mem_size(const struct btf *btf, - const struct btf_param *arg, - const struct bpf_reg_state *reg) -{ - int len, sfx_len = sizeof("__sz") - 1; - const struct btf_type *t; - const char *param_name; - - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) - return false; - - /* In the future, this can be ported to use BTF tagging */ - param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len < sfx_len) - return false; - param_name += len - sfx_len; - if (strncmp(param_name, "__sz", sfx_len)) - return false; - - return true; -} - -static bool btf_is_kfunc_arg_mem_size(const struct btf *btf, - const struct btf_param *arg, - const struct bpf_reg_state *reg, - const char *name) -{ - int len, target_len = strlen(name); - const struct btf_type *t; - const char *param_name; - - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) - return false; - - param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len != target_len) - return false; - if (strcmp(param_name, name)) - return false; - - return true; -} - static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - struct bpf_kfunc_arg_meta *kfunc_meta, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - bool rel = false, kptr_get = false, trusted_args = false; - bool sleepable = false; struct bpf_verifier_log *log = &env->log; - u32 i, nargs, ref_id, ref_obj_id = 0; - bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct 
btf_type *t, *ref_t; const struct btf_param *args; - int ref_regno = 0, ret; + u32 i, nargs, ref_id; + int ret; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -6746,14 +6633,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - if (is_kfunc && kfunc_meta) { - /* Only kfunc can be release func */ - rel = kfunc_meta->flags & KF_RELEASE; - kptr_get = kfunc_meta->flags & KF_KPTR_GET; - trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; - sleepable = kfunc_meta->flags & KF_SLEEPABLE; - } - /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -6761,42 +6640,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; - bool obj_ptr = false; t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { - if (is_kfunc && kfunc_meta) { - bool is_buf_size = false; - - /* check for any const scalar parameter of name "rdonly_buf_size" - * or "rdwr_buf_size" - */ - if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, - "rdonly_buf_size")) { - kfunc_meta->r0_rdonly = true; - is_buf_size = true; - } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, - "rdwr_buf_size")) - is_buf_size = true; - - if (is_buf_size) { - if (kfunc_meta->r0_size) { - bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); - return -EINVAL; - } - - if (!tnum_is_const(reg->var_off)) { - bpf_log(log, "R%d is not a const\n", regno); - return -EINVAL; - } - - kfunc_meta->r0_size = reg->var_off.value; - ret = mark_chain_precision(env, regno); - if (ret) - return ret; - } - } - if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); @@ -6809,88 +6655,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - /* These register types have special constraints wrt ref_obj_id - * and offset checks. The rest of trusted args don't. - */ - obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID || - reg2btf_ids[base_type(reg->type)]; - - /* Check if argument must be a referenced pointer, args + i has - * been verified to be a pointer (after skipping modifiers). - * PTR_TO_CTX is ok without having non-zero ref_obj_id. - */ - if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) { - bpf_log(log, "R%d must be referenced\n", regno); - return -EINVAL; - } - ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - /* Trusted args have the same offset checks as release arguments */ - if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id)) - arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; - if (is_kfunc && reg->ref_obj_id) { - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } - - /* kptr_get is only true for kfunc */ - if (i == 0 && kptr_get) { - struct btf_field *kptr_field; - - if (reg->type != PTR_TO_MAP_VALUE) { - bpf_log(log, "arg#0 expected pointer to map value\n"); - return -EINVAL; - } - - /* check_func_arg_reg_off allows var_off for - * PTR_TO_MAP_VALUE, but we need fixed offset to find - * off_desc. 
- */ - if (!tnum_is_const(reg->var_off)) { - bpf_log(log, "arg#0 must have constant offset\n"); - return -EINVAL; - } - - kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); - if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { - bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", - reg->off + reg->var_off.value); - return -EINVAL; - } - - if (!btf_type_is_ptr(ref_t)) { - bpf_log(log, "arg#0 BTF type must be a double pointer\n"); - return -EINVAL; - } - - ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); - ref_tname = btf_name_by_offset(btf, ref_t->name_off); - - if (!btf_type_is_struct(ref_t)) { - bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", - func_name, i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - if (!btf_struct_ids_match(log, btf, ref_id, 0, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, true)) { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", - func_name, i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - /* rest of the arguments can be anything, like normal kfunc */ - } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -6900,109 +6672,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || - (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { - const struct btf_type *reg_ref_t; - const struct btf *reg_btf; - const char *reg_ref_tname; - u32 reg_ref_id; - - if (!btf_type_is_struct(ref_t)) { - bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", - func_name, i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (reg->type == PTR_TO_BTF_ID) { - reg_btf = reg->btf; - reg_ref_id = reg->btf_id; - } else { - reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[base_type(reg->type)]; - } - - reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, - ®_ref_id); - reg_ref_tname = btf_name_by_offset(reg_btf, - reg_ref_t->name_off); - if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id, - trusted_args || (rel && reg->ref_obj_id))) { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - func_name, i, - btf_type_str(ref_t), ref_tname, - regno, btf_type_str(reg_ref_t), - reg_ref_tname); - return -EINVAL; - } } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; - if (is_kfunc) { - bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); - bool arg_dynptr = btf_type_is_struct(ref_t) && - !strcmp(ref_tname, - stringify_struct(bpf_dynptr_kern)); - - /* Permit pointer to mem, but only when argument - * type is pointer to scalar, or struct composed - * (recursively) of scalars. - * When arg_mem_size is true, the pointer can be - * void *. - * Also permit initialized local dynamic pointers. - */ - if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && - !arg_dynptr && - (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { - bpf_log(log, - "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname, arg_mem_size ? 
"void, " : ""); - return -EINVAL; - } - - if (arg_dynptr) { - if (reg->type != PTR_TO_STACK) { - bpf_log(log, "arg#%d pointer type %s %s not to stack\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (!is_dynptr_reg_valid_init(env, reg)) { - bpf_log(log, - "arg#%d pointer type %s %s must be valid and initialized\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (!is_dynptr_type_expected(env, reg, - ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { - bpf_log(log, - "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - continue; - } - - /* Check for mem, len pair */ - if (arg_mem_size) { - if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { - bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", - i, i + 1); - return -EINVAL; - } - i++; - continue; - } - } - resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { bpf_log(log, @@ -7015,36 +6688,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (check_mem_reg(env, reg, regno, type_size)) return -EINVAL; } else { - bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i, - is_kfunc ? "kernel " : "", func_name, func_id); + bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, + func_name, func_id); return -EINVAL; } } - /* Either both are set, or neither */ - WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); - /* We already made sure ref_obj_id is set only for one argument. We do - * allow (!rel && ref_obj_id), so that passing such referenced - * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when - * is_kfunc is true. - */ - if (rel && !ref_obj_id) { - bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - - if (sleepable && !env->prog->aux->sleepable) { - bpf_log(log, "kernel function %s is sleepable but the program is not\n", - func_name); - return -EINVAL; - } - - if (kfunc_meta && ref_obj_id) - kfunc_meta->ref_obj_id = ref_obj_id; - - /* returns argument register number > 0 in case of reference release kfunc */ - return rel ? ref_regno : 0; + return 0; } /* Compare BTF of a function declaration with given bpf_reg_state. @@ -7074,7 +6724,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -7117,7 +6767,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. 
@@ -7128,14 +6778,6 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return err; } -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - struct bpf_kfunc_arg_meta *meta) -{ - return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true); -} - /* Convert BTF of a function into bpf_reg_state if possible * Returns: * EFAULT - there is a verifier bug. Abort verification. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8f3abe9b08e..7d7a91c54709 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5550,8 +5550,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return err; } -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) { struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); @@ -7863,19 +7863,517 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +struct bpf_kfunc_call_arg_meta { + /* In parameters */ + struct btf *btf; + u32 func_id; + u32 kfunc_flags; + const struct btf_type *func_proto; + const char *func_name; + /* Out parameters */ + u32 ref_obj_id; + u8 release_regno; + bool r0_rdonly; + u64 r0_size; +}; + +static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ACQUIRE; +} + +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RET_NULL; +} + +static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RELEASE; +} + +static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_TRUSTED_ARGS; +} + +static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_SLEEPABLE; +} + +static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_DESTRUCTIVE; +} + +static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ + return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); +} + +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + int len, sfx_len = sizeof("__sz") - 1; + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < sfx_len) + return false; + param_name += len - sfx_len; + if (strncmp(param_name, "__sz", sfx_len)) + return false; + + return true; +} + +static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, + const struct btf_param *arg, + const char *name) +{ + int len, target_len = strlen(name); + const char *param_name; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + +enum { + KF_ARG_DYNPTR_ID, +}; + +BTF_ID_LIST(kf_arg_btf_ids) +BTF_ID(struct, bpf_dynptr_kern) + +static bool 
is_kfunc_arg_dynptr(const struct btf *btf, + const struct btf_param *arg) +{ + const struct btf_type *t; + u32 res_id; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t) + return false; + if (!btf_type_is_ptr(t)) + return false; + t = btf_type_skip_modifiers(btf, t->type, &res_id); + if (!t) + return false; + return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[KF_ARG_DYNPTR_ID]); +} + +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *t, int rec) +{ + const struct btf_type *member_type; + const struct btf_member *member; + u32 i; + + if (!btf_type_is_struct(t)) + return false; + + for_each_member(i, t, member) { + const struct btf_array *array; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (btf_type_is_struct(member_type)) { + if (rec >= 3) { + verbose(env, "max struct nesting depth exceeded\n"); + return false; + } + if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1)) + return false; + continue; + } + if (btf_type_is_array(member_type)) { + array = btf_array(member_type); + if (!array->nelems) + return false; + member_type = btf_type_skip_modifiers(btf, array->type, NULL); + if (!btf_type_is_scalar(member_type)) + return false; + continue; + } + if (!btf_type_is_scalar(member_type)) + return false; + } + return true; +} + + +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + +enum kfunc_ptr_arg_type { + KF_ARG_PTR_TO_CTX, + KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ + KF_ARG_PTR_TO_DYNPTR, + KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ + KF_ARG_PTR_TO_MEM, + KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ +}; + +static enum kfunc_ptr_arg_type +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, + const struct btf_type *t, const struct btf_type *ref_t, + const char *ref_tname, const struct btf_param *args, + int argno, int nargs) +{ + u32 regno = argno + 1; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[regno]; + bool arg_mem_size = false; + + /* In this function, we verify the kfunc's BTF as per the argument type, + * leaving the rest of the verification with respect to the register + * type to our caller. When a set of conditions hold in the BTF type of + * arguments, we resolve it to a known kfunc_ptr_arg_type. 
+ */ + if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + return KF_ARG_PTR_TO_CTX; + + if (is_kfunc_arg_kptr_get(meta, argno)) { + if (!btf_type_is_ptr(ref_t)) { + verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n"); + return -EINVAL; + } + ref_t = btf_type_by_id(meta->btf, ref_t->type); + ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off); + if (!btf_type_is_struct(ref_t)) { + verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n", + meta->func_name, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return KF_ARG_PTR_TO_KPTR; + } + + if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_DYNPTR; + + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { + if (!btf_type_is_struct(ref_t)) { + verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return KF_ARG_PTR_TO_BTF_ID; + } + + if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) + arg_mem_size = true; + + /* This is the catch all argument type of register types supported by + * check_helper_mem_access. However, we only allow when argument type is + * pointer to scalar, or struct composed (recursively) of scalars. When + * arg_mem_size is true, the pointer can be void *. + */ + if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { + verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + return -EINVAL; + } + return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; +} + +static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const struct btf_type *ref_t, + const char *ref_tname, u32 ref_id, + struct bpf_kfunc_call_arg_meta *meta, + int argno) +{ + const struct btf_type *reg_ref_t; + bool strict_type_match = false; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (reg->type == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; + } + + if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id)) + strict_type_match = true; + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); + if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { + verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + btf_type_str(reg_ref_t), reg_ref_tname); + return -EINVAL; + } + return 0; +} + +static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const struct btf_type *ref_t, + const char *ref_tname, + struct bpf_kfunc_call_arg_meta *meta, + int argno) +{ + struct btf_field *kptr_field; + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. 
+ */ + if (!tnum_is_const(reg->var_off)) { + verbose(env, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); + if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { + verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, true)) { + verbose(env, "kernel function %s args#%d expected pointer to %s %s\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return 0; +} + +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) +{ + const char *func_name = meta->func_name, *ref_tname; + const struct btf *btf = meta->btf; + const struct btf_param *args; + u32 i, nargs; + int ret; + + args = (const struct btf_param *)(meta->func_proto + 1); + nargs = btf_type_vlen(meta->func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "Function %s has %d > %d args\n", func_name, nargs, + MAX_BPF_FUNC_REG_ARGS); + return -EINVAL; + } + + /* Check that BTF function arguments match actual types that the + * verifier sees. + */ + for (i = 0; i < nargs; i++) { + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; + const struct btf_type *t, *ref_t, *resolve_ret; + enum bpf_arg_type arg_type = ARG_DONTCARE; + u32 regno = i + 1, ref_id, type_size; + bool is_ret_buf_sz = false; + int kf_arg_type; + + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + if (btf_type_is_scalar(t)) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "R%d is not a scalar\n", regno); + return -EINVAL; + } + if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) { + meta->r0_rdonly = true; + is_ret_buf_sz = true; + } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) { + is_ret_buf_sz = true; + } + + if (is_ret_buf_sz) { + if (meta->r0_size) { + verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a const\n", regno); + return -EINVAL; + } + + meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + continue; + } + + if (!btf_type_is_ptr(t)) { + verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + return -EINVAL; + } + + if (reg->ref_obj_id) { + if (is_kfunc_release(meta) && meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + if (is_kfunc_release(meta)) + meta->release_regno = regno; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + if (kf_arg_type < 0) + return kf_arg_type; + + switch (kf_arg_type) { + case KF_ARG_PTR_TO_BTF_ID: + if (!is_kfunc_trusted_args(meta)) + break; + if (!reg->ref_obj_id) { + verbose(env, "R%d must be referenced\n", regno); + return -EINVAL; + } + fallthrough; + case KF_ARG_PTR_TO_CTX: + /* Trusted arguments have the same offset checks as release arguments */ + arg_type |= OBJ_RELEASE; + break; + case KF_ARG_PTR_TO_KPTR: + case KF_ARG_PTR_TO_DYNPTR: + case KF_ARG_PTR_TO_MEM: + case KF_ARG_PTR_TO_MEM_SIZE: 
+ /* Trusted by default */ + break; + default: + WARN_ON_ONCE(1); + return -EFAULT; + } + + if (is_kfunc_release(meta) && reg->ref_obj_id) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); + if (ret < 0) + return ret; + + switch (kf_arg_type) { + case KF_ARG_PTR_TO_CTX: + if (reg->type != PTR_TO_CTX) { + verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); + return -EINVAL; + } + break; + case KF_ARG_PTR_TO_KPTR: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_DYNPTR: + if (reg->type != PTR_TO_STACK) { + verbose(env, "arg#%d expected pointer to stack\n", i); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, "arg#%d pointer type %s %s must be valid and initialized\n", + i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { + verbose(env, "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", + i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + break; + case KF_ARG_PTR_TO_BTF_ID: + /* Only base_type is checked, further checks are done here */ + if (reg->type != PTR_TO_BTF_ID && + (!reg2btf_ids[base_type(reg->type)] || type_flag(reg->type))) { + verbose(env, "arg#%d expected pointer to btf or socket\n", i); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_MEM: + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); + if (IS_ERR(resolve_ret)) { + verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + return -EINVAL; + } + ret = check_mem_reg(env, reg, regno, type_size); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_MEM_SIZE: + ret = check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1); + if (ret < 0) { + verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + return ret; + } + /* Skip next '__sz' argument */ + i++; + break; + } + } + + if (is_kfunc_release(meta) && !meta->release_regno) { + verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + + return 0; +} + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); - struct bpf_kfunc_arg_meta meta = { 0 }; const char *func_name, *ptr_type_name; + struct bpf_kfunc_call_arg_meta meta; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; u32 *kfunc_flags; - bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -7896,24 +8394,34 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name); return -EACCES; } - if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) { - verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n"); + + /* Prepare kfunc call metadata */ + memset(&meta, 0, sizeof(meta)); + meta.btf = desc_btf; + meta.func_id = func_id; + meta.kfunc_flags = *kfunc_flags; + meta.func_proto = func_proto; 
+ meta.func_name = func_name; + + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { + verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; } - acq = *kfunc_flags & KF_ACQUIRE; - - meta.flags = *kfunc_flags; + if (is_kfunc_sleepable(&meta) && !env->prog->aux->sleepable) { + verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); + return -EACCES; + } /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta); + err = check_kfunc_args(env, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted - * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ - if (err) { - err = release_reference(env, regs[err].ref_obj_id); + if (meta.release_regno) { + err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", func_name, func_id); @@ -7927,7 +8435,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_struct_ptr(desc_btf, t)) { + if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -7966,20 +8474,23 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; } - if (*kfunc_flags & KF_RET_NULL) { + if (is_kfunc_ret_null(&meta)) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); - if (acq) { + if (is_kfunc_acquire(&meta)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) return id; - regs[BPF_REG_0].id = id; + if (is_kfunc_ret_null(&meta)) + regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; } + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) + regs[BPF_REG_0].id = ++env->id_gen; } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c index c210657d4d0a..55d641c1f126 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c @@ -22,7 +22,7 @@ static struct { "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0}, {"not_valid_dynptr", "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0}, - {"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack", 0}, + {"not_ptr_to_stack", "arg#0 expected pointer to stack", 0}, {"dynptr_data_null", NULL, -EBADMSG}, }; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index e1a937277b54..86d6fef2e3b4 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point", + .errstr = "arg#0 expected pointer to btf or socket", .fixup_kfunc_btf_id = { 
{ "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 }, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index fd683a32a276..55cba01c99d5 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -142,7 +142,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar", + .errstr = "arg#0 expected pointer to btf or socket", .fixup_kfunc_btf_id = { { "bpf_lookup_user_key", 2 }, { "bpf_key_put", 4 }, @@ -163,7 +163,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar", + .errstr = "arg#0 expected pointer to btf or socket", .fixup_kfunc_btf_id = { { "bpf_lookup_system_key", 1 }, { "bpf_key_put", 3 }, -- cgit v1.2.3 From 958cf2e273f0929c66169e0788031310e8118722 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:03 +0530 Subject: bpf: Introduce bpf_obj_new Introduce type safe memory allocator bpf_obj_new for BPF programs. The kernel side kfunc is named bpf_obj_new_impl, as passing hidden arguments to kfuncs still requires having them in prototype, unlike BPF helpers which always take 5 arguments and have them checked using bpf_func_proto in verifier, ignoring unset argument types. Introduce __ign suffix to ignore a specific kfunc argument during type checks, then use this to introduce support for passing type metadata to the bpf_obj_new_impl kfunc. The user passes BTF ID of the type it wants to allocates in program BTF, the verifier then rewrites the first argument as the size of this type, after performing some sanity checks (to ensure it exists and it is a struct type). The second argument is also fixed up and passed by the verifier. This is the btf_struct_meta for the type being allocated. It would be needed mostly for the offset array which is required for zero initializing special fields while leaving the rest of storage in unitialized state. It would also be needed in the next patch to perform proper destruction of the object's special fields. Under the hood, bpf_obj_new will call bpf_mem_alloc and bpf_mem_free, using the any context BPF memory allocator introduced recently. To this end, a global instance of the BPF memory allocator is initialized on boot to be used for this purpose. This 'bpf_global_ma' serves all allocations for bpf_obj_new. In the future, bpf_obj_new variants will allow specifying a custom allocator. Note that now that bpf_obj_new can be used to allocate objects that can be linked to BPF linked list (when future linked list helpers are available), we need to also free the elements using bpf_mem_free. However, since the draining of elements is done outside the bpf_spin_lock, we need to do migrate_disable around the call since bpf_list_head_free can be called from map free path where migration is enabled. Otherwise, when called from BPF programs migration is already disabled. A convenience macro is included in the bpf_experimental.h header to hide over the ugly details of the implementation, leading to user code looking similar to a language level extension which allocates and constructs fields of a user type. 
struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); }; void prog(void) { struct foo *f; f = bpf_obj_new(typeof(*f)); if (!f) return; ... } A key piece of this story is still missing, i.e. the free function, which will come in the next patch. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-14-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 21 +++-- include/linux/bpf_verifier.h | 2 + kernel/bpf/core.c | 16 ++++ kernel/bpf/helpers.c | 47 ++++++++++-- kernel/bpf/verifier.c | 102 +++++++++++++++++++++++-- tools/testing/selftests/bpf/bpf_experimental.h | 25 ++++++ 6 files changed, 190 insertions(+), 23 deletions(-) create mode 100644 tools/testing/selftests/bpf/bpf_experimental.h (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0a74df731eb8..8b32376ce746 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -54,6 +54,8 @@ struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; +extern struct bpf_mem_alloc bpf_global_ma; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, @@ -334,16 +336,19 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) { - if (!IS_ERR_OR_NULL(map->record)) { - struct btf_field *fields = map->record->fields; - u32 cnt = map->record->cnt; - int i; + int i; - for (i = 0; i < cnt; i++) - memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); - } + if (!foffs) + return; + for (i = 0; i < foffs->cnt; i++) + memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); +} + +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_init(map->field_offs, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fb146b0ce006..3dc72d396dfc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -433,6 +433,8 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; + u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9c16338bcbe8..2e57fc839a5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -60,6 +61,9 @@ #define CTX regs[BPF_REG_CTX] #define IMM insn->imm +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; + /* No hurry in this branch * * Exported for the bpf jit load helper. 
@@ -2746,6 +2750,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len) return -ENOTSUPP; } +#ifdef CONFIG_BPF_SYSCALL +static int __init bpf_global_ma_init(void) +{ + int ret; + + ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); + bpf_global_ma_set = !ret; + return ret; +} +late_initcall(bpf_global_ma_init); +#endif + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5bc0b9f0f306..c4f1c22cc44c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -1735,25 +1736,57 @@ unlock: obj -= field->list_head.node_offset; head = head->next; - /* TODO: Rework later */ - kfree(obj); + /* The contained type can also have resources, including a + * bpf_list_head which needs to be freed. + */ + bpf_obj_free_fields(field->list_head.value_rec, obj); + /* bpf_mem_free requires migrate_disable(), since we can be + * called from map free path as well apart from BPF program (as + * part of map ops doing bpf_obj_free_fields). + */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, obj); + migrate_enable(); } } -BTF_SET8_START(tracing_btf_ids) +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + u64 size = local_type_id__k; + void *p; + + if (unlikely(!bpf_global_ma_set)) + return NULL; + p = bpf_mem_alloc(&bpf_global_ma, size); + if (!p) + return NULL; + if (meta) + bpf_obj_init(meta->field_offs, p); + return p; +} + +__diag_pop(); + +BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif -BTF_SET8_END(tracing_btf_ids) +BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_SET8_END(generic_btf_ids) -static const struct btf_kfunc_id_set tracing_kfunc_set = { +static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, - .set = &tracing_btf_ids, + .set = &generic_btf_ids, }; static int __init kfunc_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set); + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); } late_initcall(kfunc_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8fcc2808d99..804f3bca6c08 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7952,6 +7952,11 @@ static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param return __kfunc_param_match_suffix(btf, arg, "__k"); } +static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__ign"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -8214,6 +8219,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); + + if (is_kfunc_arg_ignore(btf, &args[i])) + continue; + if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -8391,6 +8400,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } +enum special_kfunc_type { + KF_bpf_obj_new_impl, +}; + +BTF_SET_START(special_kfunc_set) +BTF_ID(func, bpf_obj_new_impl) 
+BTF_SET_END(special_kfunc_set) + +BTF_ID_LIST(special_kfunc_list) +BTF_ID(func, bpf_obj_new_impl) + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -8465,17 +8485,59 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { - verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); - return -EINVAL; + /* Only exception is bpf_obj_new_impl */ + if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } } if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { - ptr_type = btf_type_skip_modifiers(desc_btf, t->type, - &ptr_type_id); - if (!btf_type_is_struct(ptr_type)) { + ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); + + if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + const struct btf_type *ret_t; + struct btf *ret_btf; + u32 ret_btf_id; + + if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta.arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + + env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; + env->insn_aux_data[insn_idx].kptr_struct_meta = + btf_find_struct_meta(ret_btf, ret_btf_id); + } else { + verbose(env, "kernel function %s unhandled dynamic return type\n", + meta.func_name); + return -EFAULT; + } + } else if (!__btf_type_is_struct(ptr_type)) { if (!meta.r0_size) { ptr_type_name = btf_name_by_offset(desc_btf, ptr_type->name_off); @@ -8503,6 +8565,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; } + if (is_kfunc_ret_null(&meta)) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ @@ -14671,8 +14734,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -static int fixup_kfunc_call(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; @@ -14691,8 +14754,21 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, return -EFAULT; } + *cnt = 0; insn->imm = desc->imm; + if (insn->off) + return 0; + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) 
}; + u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); + insn_buf[1] = addr[0]; + insn_buf[2] = addr[1]; + insn_buf[3] = *insn; + *cnt = 4; + } return 0; }
@@ -14834,9 +14910,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) continue; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn); + ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); if (ret) return ret; + if (cnt == 0) + continue; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; }
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h new file mode 100644 index 000000000000..aeb6a7fcb7c4 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -0,0 +1,25 @@ +#ifndef __BPF_EXPERIMENTAL__ +#define __BPF_EXPERIMENTAL__ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> + +/* Description + * Allocates an object of the type represented by 'local_type_id' in + * program BTF. User may use the bpf_core_type_id_local macro to pass the + * type ID of a struct in program BTF. + * + * The 'local_type_id' parameter must be a known constant. + * The 'meta' parameter is a hidden argument that is ignored. + * Returns + * A pointer to an object of the type corresponding to the passed in + * 'local_type_id', or NULL on failure. + */ +extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; + +/* Convenience macro to wrap over bpf_obj_new_impl */ +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) + +#endif
-- cgit v1.2.3
From 534e86bc6c66e1e0c798a1c0a6a680bb231c08db Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:07 +0530 Subject: bpf: Add 'release on unlock' logic for bpf_list_push_{front,back}
This commit implements the delayed release logic for bpf_list_push_front and bpf_list_push_back. Once a node has been added to the list, its pointer changes to PTR_UNTRUSTED. However, it is only released once the lock protecting the list is unlocked. For such PTR_TO_BTF_ID | MEM_ALLOC with PTR_UNTRUSTED set but an active ref_obj_id, it is still permitted to read them as long as the lock is held. Writing to them is not allowed. This allows having read access to pushed items we no longer own until we release the lock guarding the list, allowing a little more flexibility when working with these APIs.
Note that enabling write support has fairly tricky interactions with what happens inside the critical section. Just as an example, currently, bpf_obj_drop is not permitted, but if it were, being able to write to the PTR_UNTRUSTED pointer while the object gets released back to the memory allocator would violate safety properties we wish to guarantee (i.e. not crashing the kernel). The memory could be reused for a different type in the BPF program or even in the kernel as it gets eventually kfree'd. Not enabling bpf_obj_drop inside the critical section would appear to prevent all of the above, but that is more of an artificial limitation right now. Since the write support is tangled with how we handle potential aliasing of nodes inside the critical section that may or may not be part of the list anymore, it has been deferred to a future patch.
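As an illustrative sketch of the resulting semantics (not taken from this series; the element type and the glock/ghead declarations are assumed to follow the usual __contains() selftest conventions):

    struct elem {
            int data;
            struct bpf_list_node node;
    };

    /* assumed: a struct bpf_spin_lock 'glock' protecting a
     * struct bpf_list_head 'ghead' __contains(elem, node)
     */
    struct elem *e = bpf_obj_new(typeof(*e));
    int data = 0;

    if (!e)
            return 0;
    bpf_spin_lock(&glock);
    bpf_list_push_front(&ghead, &e->node);
    /* e is now PTR_UNTRUSTED; reads stay legal while the lock is held */
    data = e->data;
    /* but writes such as e->data = 1; are rejected */
    bpf_spin_unlock(&glock);
    /* the reference is released here; e may no longer be dereferenced */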
Acked-by: Dave Marchevsky Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-18-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 ++++ kernel/bpf/verifier.c | 58 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3dc72d396dfc..23f30c685f28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -223,6 +223,11 @@ struct bpf_reference_state { * exiting a callback function. */ int callback_ref; + /* Mark the reference state to release the registers sharing the same id + * on bpf_spin_unlock (for nodes that we will lose ownership to but are + * safe to access inside the critical section). + */ + bool release_on_unlock; }; /* state of the program: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1364df74129e..195d24316750 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5652,7 +5652,9 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, cur->active_lock.ptr = btf; cur->active_lock.id = reg->id; } else { + struct bpf_func_state *fstate = cur_func(env); void *ptr; + int i; if (map) ptr = map; @@ -5670,6 +5672,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, } cur->active_lock.ptr = NULL; cur->active_lock.id = 0; + + for (i = 0; i < fstate->acquired_refs; i++) { + int err; + + /* Complain on error because this reference state cannot + * be freed before this point, as bpf_spin_lock critical + * section does not allow functions that release the + * allocated object immediately. + */ + if (!fstate->refs[i].release_on_unlock) + continue; + err = release_reference(env, fstate->refs[i].id); + if (err) { + verbose(env, "failed to release release_on_unlock reference"); + return err; + } + } } return 0; } @@ -8260,6 +8279,42 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, return 0; } +static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id) +{ + struct bpf_func_state *state = cur_func(env); + struct bpf_reg_state *reg; + int i; + + /* bpf_spin_lock only allows calling list_push and list_pop, no BPF + * subprogs, no global functions. This means that the references would + * not be released inside the critical section but they may be added to + * the reference state, and the acquired_refs are never copied out for a + * different frame as BPF to BPF calls don't work in bpf_spin_lock + * critical sections. 
+ */ + if (!ref_obj_id) { + verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n"); + return -EFAULT; + } + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].release_on_unlock) { + verbose(env, "verifier internal error: expected false release_on_unlock"); + return -EFAULT; + } + state->refs[i].release_on_unlock = true; + /* Now mark everyone sharing same ref_obj_id as untrusted */ + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + reg->type |= PTR_UNTRUSTED; + })); + return 0; + } + } + verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + return -EFAULT; +}
+ /* Implementation details: * * Each register points to some region of memory, which we define as an
@@ -8433,7 +8488,8 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, btf_name_by_offset(field->list_head.btf, et->name_off)); return -EINVAL; } - return 0; + /* Set arg#1 for expiration after unlock */ + return ref_set_release_on_unlock(env, reg->ref_obj_id); } static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
-- cgit v1.2.3
From f20a0a0519f35a3a34236ed2d38b067c5cb22b9f Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Nov 2022 02:18:28 +0900 Subject: ethtool: doc: clarify what drivers can implement in their get_drvinfo()
Many of the drivers which implement ethtool_ops::get_drvinfo() will print the .driver, .version or .bus_info of struct ethtool_drvinfo. For a glance at the current state, do:
$ git grep -W "get_drvinfo(struct"
Filling in those three fields is useless because:
- since [1], the driver version should be the kernel version (at least for upstream drivers). Arguably, out of tree drivers might still want to set a custom version, but out of tree is not our focus.
- since [2], the core is able to provide default values for .driver and .bus_info.
In summary, drivers may provide .fw_version and .erom_version, the rest is expected to be done by the core.
In the struct ethtool_ops doc from linux/ethtool.h: rephrase the get_drvinfo() field doc to discourage developers from implementing this callback.
In the struct ethtool_drvinfo doc from uapi/linux/ethtool.h: remove the paragraph mentioning what drivers should do. Rationale: no need to repeat what is already written in the struct ethtool_ops doc. But add a note that .fw_version and .erom_version are driver defined.
Also update the dummy driver and simply remove the callback in order not to confuse newcomers: most of the drivers will not need this callback function any more.
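For instance (a hypothetical driver; 'foo_priv' and its 'fw_ver' field are invented for illustration), a device that still has firmware information to report would implement only that and let the core provide the rest:

    static void foo_get_drvinfo(struct net_device *dev,
                                struct ethtool_drvinfo *info)
    {
            struct foo_priv *priv = netdev_priv(dev);

            /* .driver, .version and .bus_info are filled in by the core */
            strscpy(info->fw_version, priv->fw_ver,
                    sizeof(info->fw_version));
    }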
[1] commit 6a7e25c7fb48 ("net/core: Replace driver version to be kernel version") Link: https://git.kernel.org/torvalds/linux/c/6a7e25c7fb48 [2] commit edaf5df22cb8 ("ethtool: ethtool_get_drvinfo: populate drvinfo fields even if callback exits") Link: https://git.kernel.org/netdev/net-next/c/edaf5df22cb8 Reviewed-by: Leon Romanovsky Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221116171828.4093-1-mailhol.vincent@wanadoo.fr Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 7 ------- include/linux/ethtool.h | 8 ++++---- include/uapi/linux/ethtool.h | 10 ++++------ 3 files changed, 8 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index aa0fc00faecb..c4b1b0aa438a 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -99,14 +99,7 @@ static const struct net_device_ops dummy_netdev_ops = { .ndo_change_carrier = dummy_change_carrier, }; -static void dummy_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) -{ - strscpy(info->driver, DRV_NAME, sizeof(info->driver)); -} - static const struct ethtool_ops dummy_ethtool_ops = { - .get_drvinfo = dummy_get_drvinfo, .get_ts_info = ethtool_op_get_ts_info, }; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 5c51c7fda32a..9e0a76fc7de9 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -473,10 +473,10 @@ struct ethtool_module_power_mode_params { * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. - * @get_drvinfo: Report driver/device information. Should only set the - * @driver, @version, @fw_version and @bus_info fields. If not - * implemented, the @driver and @bus_info fields will be filled in - * according to the netdev's parent device. + * @get_drvinfo: Report driver/device information. Modern drivers no + * longer have to implement this callback. Most fields are + * correctly filled in by the core using system information, or + * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f341de2ae612..58e587ba0450 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -159,8 +159,10 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) * in its bus driver structure (e.g. pci_driver::name). Must * not be an empty string. * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; may be an empty string - * @erom_version: Expansion ROM version string; may be an empty string + * @fw_version: Firmware version string; driver defined; may be an + * empty string + * @erom_version: Expansion ROM version string; driver defined; may be + * an empty string * @bus_info: Device bus address. This should match the dev_name() * string for the underlying bus device, if there is one. May be * an empty string. @@ -179,10 +181,6 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) * * Users can use the %ETHTOOL_GSSET_INFO command to get the number of * strings in any string set (from Linux 2.6.34). - * - * Drivers should set at most @driver, @version, @fw_version and - * @bus_info in their get_drvinfo() implementation. The ethtool - * core fills in the other fields using other driver operations. 
*/ struct ethtool_drvinfo { __u32 cmd; -- cgit v1.2.3 From 647541ea06a7bbbcf941e501c726b3e26328c102 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:40:21 -0500 Subject: sctp: move SCTP_PAD4 and SCTP_TRUNC4 to linux/sctp.h Move these two macros from net/sctp/sctp.h to linux/sctp.h, so that it will be enough to include only linux/sctp.h in nft_exthdr.c and xt_sctp.c. It should not include "net/sctp/sctp.h" if a module does not have a dependence on SCTP module. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ef6468a687f36da06f575c2131cd4612f6b7be88.1668526821.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 5 +++++ include/net/sctp/sctp.h | 5 ----- net/netfilter/nft_exthdr.c | 1 - net/netfilter/xt_sctp.c | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a86e852507b3..358dc08e0831 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -820,4 +820,9 @@ struct sctp_new_encap_port_hdr { __be16 new_port; }; +/* Round an int up to the next multiple of 4. */ +#define SCTP_PAD4(s) (((s)+3)&~3) +/* Truncate to the previous multiple of 4. */ +#define SCTP_TRUNC4(s) ((s)&~3) + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a04999ee99b0..01d904b34cf0 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -67,11 +67,6 @@ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif -/* Round an int up to the next multiple of 4. */ -#define SCTP_PAD4(s) (((s)+3)&~3) -/* Truncate to the previous multiple of 4. */ -#define SCTP_TRUNC4(s) ((s)&~3) - /* * Function declarations. */ diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index ed929d0d37ce..a54a7f772cec 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -13,7 +13,6 @@ #include #include #include -#include #include struct nft_exthdr { diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 680015ba7cb6..e8961094a282 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From ef66c5475d7fb864c2418d3bdd19dee46324624b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:01 -0600 Subject: bpf: Allow multiple modifiers in reg_type_str() prefix reg_type_str() in the verifier currently only allows a single register type modifier to be present in the 'prefix' string which is eventually stored in the env type_str_buf. This currently works fine because there are no overlapping type modifiers, but once PTR_TRUSTED is added, that will no longer be the case. This patch updates reg_type_str() to support having multiple modifiers in the prefix string, and updates the size of type_str_buf to be 128 bytes. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 23f30c685f28..608dde740fef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -19,7 +19,7 @@ */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of type_str_buf in bpf_verifier. 
*/ -#define TYPE_STR_BUF_LEN 64 +#define TYPE_STR_BUF_LEN 128 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 195d24316750..67a6f11d953c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -557,7 +557,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[32] = {0}; + char postfix[16] = {0}, prefix[64] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "scalar", @@ -589,16 +589,13 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 32); - if (type & MEM_RINGBUF) - strncpy(prefix, "ringbuf_", 32); - if (type & MEM_USER) - strncpy(prefix, "user_", 32); - if (type & MEM_PERCPU) - strncpy(prefix, "percpu_", 32); - if (type & PTR_UNTRUSTED) - strncpy(prefix, "untrusted_", 32); + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s", + type & MEM_RDONLY ? "rdonly_" : "", + type & MEM_RINGBUF ? "ringbuf_" : "", + type & MEM_USER ? "user_" : "", + type & MEM_PERCPU ? "percpu_" : "", + type & PTR_UNTRUSTED ? "untrusted_" : "" + ); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); -- cgit v1.2.3 From 3f00c52393445ed49aadc1a567aa502c6333b1a1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:02 -0600 Subject: bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs Kfuncs currently support specifying the KF_TRUSTED_ARGS flag to signal to the verifier that it should enforce that a BPF program passes it a "safe", trusted pointer. Currently, "safe" means that the pointer is either PTR_TO_CTX, or is refcounted. There may be cases, however, where the kernel passes a BPF program a safe / trusted pointer to an object that the BPF program wishes to use as a kptr, but because the object does not yet have a ref_obj_id from the perspective of the verifier, the program would be unable to pass it to a KF_ACQUIRE | KF_TRUSTED_ARGS kfunc. The solution is to expand the set of pointers that are considered trusted according to KF_TRUSTED_ARGS, so that programs can invoke kfuncs with these pointers without getting rejected by the verifier. There is already a PTR_UNTRUSTED flag that is set in some scenarios, such as when a BPF program reads a kptr directly from a map without performing a bpf_kptr_xchg() call. These pointers of course can and should be rejected by the verifier. Unfortunately, however, PTR_UNTRUSTED does not cover all the cases for safety that need to be addressed to adequately protect kfuncs. Specifically, pointers obtained by a BPF program "walking" a struct are _not_ considered PTR_UNTRUSTED according to BPF. For example, say that we were to add a kfunc called bpf_task_acquire(), with KF_ACQUIRE | KF_TRUSTED_ARGS, to acquire a struct task_struct *. If we only used PTR_UNTRUSTED to signal that a task was unsafe to pass to a kfunc, the verifier would mistakenly allow the following unsafe BPF program to be loaded: SEC("tp_btf/task_newtask") int BPF_PROG(unsafe_acquire_task, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *nested; nested = task->last_wakee; /* Would not be rejected by the verifier. 
*/ acquired = bpf_task_acquire(nested); if (!acquired) return 0; bpf_task_release(acquired); return 0; } To address this, this patch defines a new type flag called PTR_TRUSTED which tracks whether a PTR_TO_BTF_ID pointer is safe to pass to a KF_TRUSTED_ARGS kfunc or a BPF helper function. PTR_TRUSTED pointers are passed directly from the kernel as a tracepoint or struct_ops callback argument. Any nested pointer that is obtained from walking a PTR_TRUSTED pointer is no longer PTR_TRUSTED. From the example above, the struct task_struct *task argument is PTR_TRUSTED, but the 'nested' pointer obtained from 'task->last_wakee' is not PTR_TRUSTED. A subsequent patch will add kfuncs for storing a task kfunc as a kptr, and then another patch will add selftests to validate. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 30 +++++----- include/linux/bpf.h | 30 ++++++++++ include/linux/bpf_verifier.h | 7 +++ include/linux/btf.h | 65 ++++++++++++-------- kernel/bpf/btf.c | 8 +++ kernel/bpf/verifier.c | 69 ++++++++++++++++++---- kernel/trace/bpf_trace.c | 2 +- net/ipv4/bpf_tcp_ca.c | 4 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../testing/selftests/bpf/verifier/ref_tracking.c | 4 +- 10 files changed, 164 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 3b1501c3b6cd..90774479ab7a 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -161,22 +161,20 @@ KF_ACQUIRE and KF_RET_NULL flags. -------------------------- The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It -indicates that the all pointer arguments will always have a guaranteed lifetime, -and pointers to kernel objects are always passed to helpers in their unmodified -form (as obtained from acquire kfuncs). - -It can be used to enforce that a pointer to a refcounted object acquired from a -kfunc or BPF helper is passed as an argument to this kfunc without any -modifications (e.g. pointer arithmetic) such that it is trusted and points to -the original object. - -Meanwhile, it is also allowed pass pointers to normal memory to such kfuncs, -but those can have a non-zero offset. - -This flag is often used for kfuncs that operate (change some property, perform -some operation) on an object that was obtained using an acquire kfunc. Such -kfuncs need an unchanged pointer to ensure the integrity of the operation being -performed on the expected object. +indicates that the all pointer arguments are valid, and that all pointers to +BTF objects have been passed in their unmodified form (that is, at a zero +offset, and without having been obtained from walking another pointer). + +There are two types of pointers to kernel objects which are considered "valid": + +1. Pointers which are passed as tracepoint or struct_ops callback arguments. +2. Pointers which were returned from a KF_ACQUIRE or KF_KPTR_GET kfunc. + +Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to +KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset. + +The definition of "valid" pointers is subject to change at any time, and has +absolutely no ABI stability guarantees. 
2.4.6 KF_SLEEPABLE flag ----------------------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b32376ce746..c9eafa67f2a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -543,6 +543,35 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + /* PTR was passed from the kernel in a trusted context, and may be + * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. + * PTR_UNTRUSTED refers to a kptr that was read directly from a map + * without invoking bpf_kptr_xchg(). What we really need to know is + * whether a pointer is safe to pass to a kfunc or BPF helper function. + * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF + * helpers, they do not cover all possible instances of unsafe + * pointers. For example, a pointer that was obtained from walking a + * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the + * fact that it may be NULL, invalid, etc. This is due to backwards + * compatibility requirements, as this was the behavior that was first + * introduced when kptrs were added. The behavior is now considered + * deprecated, and PTR_UNTRUSTED will eventually be removed. + * + * PTR_TRUSTED, on the other hand, is a pointer that the kernel + * guarantees to be valid and safe to pass to kfuncs and BPF helpers. + * For example, pointers passed to tracepoint arguments are considered + * PTR_TRUSTED, as are pointers that are passed to struct_ops + * callbacks. As alluded to above, pointers that are obtained from + * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a + * struct task_struct *task is PTR_TRUSTED, then accessing + * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored + * in a BPF register. Similarly, pointers passed to certain programs + * types such as kretprobes are not guaranteed to be valid, as they may + * for example contain an object that was recently freed. + */ + PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -636,6 +665,7 @@ enum bpf_return_type { RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 608dde740fef..545152ac136c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,4 +680,11 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) + +static inline bool bpf_type_has_unsafe_modifiers(u32 type) +{ + return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index d5b26380a60f..d38aa4251c28 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -19,36 +19,53 @@ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ #define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ -/* Trusted arguments are those which are meant to be referenced arguments with - * unchanged offset. 
It is used to enforce that pointers obtained from acquire - * kfuncs remain unmodified when being passed to helpers taking trusted args. +/* Trusted arguments are those which are guaranteed to be valid when passed to + * the kfunc. It is used to enforce that pointers obtained from either acquire + * kfuncs, or from the main kernel on a tracepoint or struct_ops callback + * invocation, remain unmodified when being passed to helpers taking trusted + * args. * - * Consider - * struct foo { - * int data; - * struct foo *next; - * }; + * Consider, for example, the following new task tracepoint: * - * struct bar { - * int data; - * struct foo f; - * }; + * SEC("tp_btf/task_newtask") + * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) + * { + * ... + * } * - * struct foo *f = alloc_foo(); // Acquire kfunc - * struct bar *b = alloc_bar(); // Acquire kfunc + * And the following kfunc: * - * If a kfunc set_foo_data() wants to operate only on the allocated object, it - * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like: + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) * - * set_foo_data(f, 42); // Allowed - * set_foo_data(f->next, 42); // Rejected, non-referenced pointer - * set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type - * set_foo_data(&b->f, 42); // Rejected, referenced, but bad offset + * All invocations to the kfunc must pass the unmodified, unwalked task: * - * In the final case, usually for the purposes of type matching, it is deduced - * by looking at the type of the member at the offset, but due to the - * requirement of trusted argument, this deduction will be strict and not done - * for this case. + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(task->last_wakee); // Rejected, walked task + * + * Programs may also pass referenced tasks directly to the kfunc: + * + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(task); // Allowed, same as above + * bpf_task_acquire(acquired); // Allowed + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task + * + * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or + * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these + * pointers are guaranteed to be safe. 
For example, the following BPF program + * would be rejected: + * + * SEC("kretprobe/free_task") + * int BPF_PROG(free_task_probe, struct task_struct *tsk) + * { + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer + * bpf_task_release(acquired); + * + * return 0; + * } */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7d5fab61535..d52054ec69c9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5799,6 +5799,11 @@ static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, return nr_args + 1; } +static bool prog_type_args_trusted(enum bpf_prog_type prog_type) +{ + return prog_type == BPF_PROG_TYPE_TRACING || prog_type == BPF_PROG_TYPE_STRUCT_OPS; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -5942,6 +5947,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } info->reg_type = PTR_TO_BTF_ID; + if (prog_type_args_trusted(prog->type)) + info->reg_type |= PTR_TRUSTED; + if (tgt_prog) { enum bpf_prog_type tgt_type; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67a6f11d953c..5bc9d84d7924 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -589,12 +589,13 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - snprintf(prefix, sizeof(prefix), "%s%s%s%s%s", + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s", type & MEM_RDONLY ? "rdonly_" : "", type & MEM_RINGBUF ? "ringbuf_" : "", type & MEM_USER ? "user_" : "", type & MEM_PERCPU ? "percpu_" : "", - type & PTR_UNTRUSTED ? "untrusted_" : "" + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? "trusted_" : "" ); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", @@ -3856,7 +3857,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL; + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ @@ -4732,6 +4733,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; + /* Any pointer obtained from walking a trusted pointer is no longer trusted. 
*/ + flag &= ~PTR_TRUSTED; + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -5844,6 +5848,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { PTR_TO_TCP_SOCK, PTR_TO_XDP_SOCK, PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; @@ -5884,8 +5889,18 @@ static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; -static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; +static const struct bpf_reg_types btf_ptr_types = { + .types = { + PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, + }, +}; +static const struct bpf_reg_types percpu_btf_ptr_types = { + .types = { + PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, + } +}; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5973,7 +5988,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. @@ -6055,6 +6070,8 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. In the other cases, fixed offset * can be non-zero. @@ -7939,6 +7956,25 @@ static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); } +static bool is_trusted_reg(const struct bpf_reg_state *reg) +{ + /* A referenced register is always trusted. */ + if (reg->ref_obj_id) + return true; + + /* If a register is not referenced, it is trusted if it has either the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the + * other type modifiers may be safe, but we elect to take an opt-in + * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are + * not. + * + * Eventually, we should make PTR_TRUSTED the single source of truth + * for whether a register is trusted. 
+ */ + return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && + !bpf_type_has_unsafe_modifiers(reg->type); +} + static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -8220,7 +8256,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const char *reg_ref_tname; u32 reg_ref_id; - if (reg->type == PTR_TO_BTF_ID) { + if (base_type(reg->type) == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; } else { @@ -8366,6 +8402,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->map_ptr; break; case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: ptr = reg->btf; break; default: @@ -8596,8 +8633,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta)) break; - if (!reg->ref_obj_id) { - verbose(env, "R%d must be referenced\n", regno); + + if (!is_trusted_reg(reg)) { + verbose(env, "R%d must be referenced or trusted\n", regno); return -EINVAL; } fallthrough; @@ -8702,9 +8740,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ - if (reg->type != PTR_TO_BTF_ID && - (!reg2btf_ids[base_type(reg->type)] || type_flag(reg->type))) { - verbose(env, "arg#%d expected pointer to btf or socket\n", i); + if ((base_type(reg->type) != PTR_TO_BTF_ID || + bpf_type_has_unsafe_modifiers(reg->type)) && + !reg2btf_ids[base_type(reg->type)]) { + verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "expected %s or socket\n", + reg_type_str(env, base_type(reg->type) | + (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); @@ -14713,6 +14755,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) break; case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: + case PTR_TO_BTF_ID | PTR_TRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle @@ -14720,6 +14763,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. 
*/ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_BTF_ID | PTR_UNTRUSTED | PTR_TRUSTED: + case PTR_TO_BTF_ID | PTR_UNTRUSTED | MEM_ALLOC | PTR_TRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f2d8d070d024..5b9008bc597b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c
@@ -774,7 +774,7 @@ BPF_CALL_0(bpf_get_current_task_btf) const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, - .ret_type = RET_PTR_TO_BTF_ID, + .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], };
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index d15c91de995f..4517d2bd186a 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c
@@ -61,7 +61,9 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info)) return false; - if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + if (base_type(info->reg_type) == PTR_TO_BTF_ID && + !bpf_type_has_unsafe_modifiers(info->reg_type) && + info->btf_id == sock_id) /* promote it to tcp_sock */ info->btf_id = tcp_sock_id;
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 86d6fef2e3b4..3193915c5ee6 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected pointer to btf or socket", + .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 },
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 55cba01c99d5..9540164712b7 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -142,7 +142,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 expected pointer to btf or socket", + .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", .fixup_kfunc_btf_id = { { "bpf_lookup_user_key", 2 }, { "bpf_key_put", 4 },
@@ -163,7 +163,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 expected pointer to btf or socket", + .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", .fixup_kfunc_btf_id = { { "bpf_lookup_system_key", 1 }, { "bpf_key_put", 3 },
-- cgit v1.2.3
From fd264ca020948a743e4c36731dfdecc4a812153c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 20 Nov 2022 11:54:32 -0800 Subject: bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx
Implement bpf_cast_to_kern_ctx() kfunc which does a type cast of a uapi ctx object to the corresponding kernel ctx. Previously, if users wanted to access some data available in the kctx but not in the uapi ctx, the bpf_probe_read_kernel() helper was needed. The introduction of bpf_cast_to_kern_ctx() allows direct memory access, which makes the code simpler and easier to understand.
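As a sketch of the intended usage (not from this patch; the program section and the field read are chosen for illustration), a tc program can cast its __sk_buff once and then load kernel-only sk_buff fields directly:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    extern void *bpf_cast_to_kern_ctx(void *obj) __ksym;

    SEC("tc")
    int cast_example(struct __sk_buff *skb)
    {
            struct sk_buff *kskb = bpf_cast_to_kern_ctx(skb);

            /* mac_len is not exposed through the uapi __sk_buff; with
             * the cast it is read directly, no bpf_probe_read_kernel()
             */
            return kskb->mac_len;
    }

    char LICENSE[] SEC("license") = "GPL";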
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221120195432.3113982-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 20 ++++++++++++++++++++ kernel/bpf/helpers.c | 6 ++++++ kernel/bpf/verifier.c | 22 ++++++++++++++++++++++ 4 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index d38aa4251c28..9ed00077db6e 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -487,6 +487,7 @@ const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); #else @@ -531,6 +532,10 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, { return NULL; } +static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, + enum bpf_prog_type prog_type) { + return -EINVAL; +} static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1c78d4df9e18..1a59cc7ad730 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5603,6 +5603,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) +{ + const struct btf_member *kctx_member; + const struct btf_type *conv_struct; + const struct btf_type *kctx_type; + u32 kctx_type_id; + + conv_struct = bpf_ctx_convert.t; + /* get member for kernel ctx type */ + kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + kctx_type_id = kctx_member->type; + kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id); + if (!btf_type_is_struct(kctx_type)) { + bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id); + return -EINVAL; + } + + return kctx_type_id; +} + BTF_ID_LIST(bpf_ctx_convert_btf_id) BTF_ID(struct, bpf_ctx_convert) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8e0d80b6bf09..e45e72676f5a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1879,6 +1879,11 @@ void bpf_task_release(struct task_struct *p) put_task_struct_rcu_user(p); } +void *bpf_cast_to_kern_ctx(void *obj) +{ + return obj; +} + __diag_pop(); BTF_SET8_START(generic_btf_ids) @@ -1907,6 +1912,7 @@ BTF_ID(struct, task_struct) BTF_ID(func, bpf_task_release) BTF_SET8_START(common_btf_ids) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb090af35477..d63cc0fb0a9c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7907,6 +7907,7 @@ struct bpf_kfunc_call_arg_meta { u32 ref_obj_id; u8 release_regno; bool r0_rdonly; + u32 ret_btf_id; u64 r0_size; struct { u64 value; @@ -8151,6 +8152,7 @@ enum special_kfunc_type { KF_bpf_list_push_back, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_cast_to_kern_ctx, }; BTF_SET_START(special_kfunc_set) @@ -8160,6 +8162,7 @@ BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8169,6 +8172,7 @@ BTF_ID(func, 
bpf_list_push_front) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, @@ -8182,6 +8186,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + return KF_ARG_PTR_TO_CTX; + /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. When a set of conditions hold in the BTF type of @@ -8668,6 +8675,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); return -EINVAL; } + + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog)); + if (ret < 0) + return -EINVAL; + meta->ret_btf_id = ret; + } break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { @@ -8922,6 +8936,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = field->list_head.btf; regs[BPF_REG_0].btf_id = field->list_head.value_btf_id; regs[BPF_REG_0].off = field->list_head.node_offset; + } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; } else { verbose(env, "kernel function %s unhandled dynamic return type\n", meta.func_name); @@ -15175,6 +15194,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[1] = addr[1]; insn_buf[2] = *insn; *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *cnt = 1; } return 0; } -- cgit v1.2.3 From 19d05ea712ecbbb67d302664da5ec58b37b9aece Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:54 +0200 Subject: net: dsa: move tag_8021q headers to their proper place tag_8021q definitions are all over the place. Some are exported to linux/dsa/8021q.h (visible by DSA core, taggers, switch drivers and everyone else), and some are in dsa_priv.h. Move the structures that don't need external visibility into tag_8021q.c, and the ones which don't need the world or switch drivers to see them into tag_8021q.h. We also have the tag_8021q.h inclusion from switch.c, which is basically the entire reason why tag_8021q.c was built into DSA in commit 8b6e638b4be2 ("net: dsa: build tag_8021q.c as part of DSA core"). I still don't know how to better deal with that, so leave it alone. 
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 31 +------------------------------ include/net/dsa.h | 1 + net/dsa/dsa_priv.h | 8 -------- net/dsa/port.c | 1 + net/dsa/switch.c | 1 + net/dsa/tag_8021q.c | 15 +++++++++++++++ net/dsa/tag_8021q.h | 27 +++++++++++++++++++++++++++ net/dsa/tag_ocelot_8021q.c | 1 + net/dsa/tag_sja1105.c | 1 + 9 files changed, 48 insertions(+), 38 deletions(-) create mode 100644 net/dsa/tag_8021q.h (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 3ed117e299ec..f3664ee12170 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,28 +5,8 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H -#include -#include #include - -struct dsa_switch; -struct dsa_port; -struct sk_buff; -struct net_device; - -struct dsa_tag_8021q_vlan { - struct list_head list; - int port; - u16 vid; - refcount_t refcount; -}; - -struct dsa_8021q_context { - struct dsa_switch *ds; - struct list_head vlans; - /* EtherType of RX VID, used for filtering on master interface */ - __be16 proto; -}; +#include int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); @@ -38,15 +18,6 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, - u16 tpid, u16 tci); - -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *vbid); - -struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, - int vbid); - u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); diff --git a/include/net/dsa.h b/include/net/dsa.h index d5bfcb63d4c2..96086289aa9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -22,6 +22,7 @@ #include #include +struct dsa_8021q_context; struct tc_action; struct phy_device; struct fixed_phy_status; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index aa685d2309e0..265659954ffd 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -13,15 +13,7 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG -struct dsa_notifier_tag_8021q_vlan_info; - /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* tag_8021q.c */ -int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, - struct dsa_notifier_tag_8021q_vlan_info *info); -int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, - struct dsa_notifier_tag_8021q_vlan_info *info); - #endif diff --git a/net/dsa/port.c b/net/dsa/port.c index e6d5c05b41b4..67ad1adec2a2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -16,6 +16,7 @@ #include "port.h" #include "slave.h" #include "switch.h" +#include "tag_8021q.h" /** * dsa_port_notify - Notify the switching fabric of changes to a port diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 4420af0081af..e53cc0c3c933 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -17,6 +17,7 @@ #include "port.h" #include "slave.h" #include "switch.h" +#include "tag_8021q.h" static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index abd994dc76d5..ac2eb933106e 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -11,6 +11,7 @@ #include "port.h" #include "switch.h" #include "tag.h" +#include "tag_8021q.h" /* Binary structure of the fake 
12-bit VID field (when the TPID is * ETH_P_DSA_8021Q):
@@ -63,6 +64,20 @@ #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) +struct dsa_tag_8021q_vlan { + struct list_head list; + int port; + u16 vid; + refcount_t refcount; +}; + +struct dsa_8021q_context { + struct dsa_switch *ds; + struct list_head vlans; + /* EtherType of RX VID, used for filtering on master interface */ + __be16 proto; +}; + u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num) { /* The VBID value of 0 is reserved for precise TX, but it is also
diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h new file mode 100644 index 000000000000..b75cbaa028ef --- /dev/null +++ b/net/dsa/tag_8021q.h
@@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_TAG_8021Q_H +#define __DSA_TAG_8021Q_H + +#include + +#include "switch.h" + +struct sk_buff; +struct net_device; + +struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, + u16 tpid, u16 tci); + +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid); + +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid); + +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); + +#endif
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 7f0c2d71e89b..1f0b8c20eba5 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c
@@ -12,6 +12,7 @@ #include #include "tag.h" +#include "tag_8021q.h" #define OCELOT_8021Q_NAME "ocelot-8021q"
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 8f581617e15c..f14f51b41491 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c
@@ -7,6 +7,7 @@ #include #include "tag.h" +#include "tag_8021q.h" #define SJA1105_NAME "sja1105" #define SJA1110_NAME "sja1110"
-- cgit v1.2.3
From beb3d47d1d3d7185bb401af628ad32ee204a9526 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 07:57:59 -0800 Subject: bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set
With CONFIG_DEBUG_INFO_BTF not set, we hit the following compilation error,
/.../kernel/bpf/verifier.c:8196:23: error: array index 6 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) ^ ~~~~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ^ /.../kernel/bpf/verifier.c:8443:19: error: array index 5 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] btf_id == special_kfunc_list[KF_bpf_list_pop_back]; ^ ~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ...
Fix the problem by increasing the size of BTF_ID_LIST to 16 to avoid the compilation error and also prevent potentially unintended issues due to out-of-bounds access.
Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Nathan Chancellor Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221123155759.2669749-1-yhs@fb.com Signed-off-by: Alexei Starovoitov
--- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux')
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index c9744efd202f..93397711a68c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h
@@ -204,7 +204,7 @@ extern struct btf_id_set8 name; #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED
-- cgit v1.2.3
From 5a0f663f0189cf9e031e444f97c029717a99548d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:06 -0800 Subject: compiler_types: Define __rcu as __attribute__((btf_type_tag("rcu")))
Currently, without rcu attribute info in BTF, the verifier treats an rcu-tagged pointer as a normal pointer. This might be a problem for sleepable programs, where rcu_read_lock()/unlock() is not available. For example, for a sleepable fentry program, if an rcu protected memory access is interleaved with a sleepable helper/kfunc, it is possible the memory access after the sleepable helper/kfunc might be invalid since the object might have been freed by then. To prevent such cases, introducing rcu tagging for memory accesses in the verifier can help to reject such programs.
To enable rcu tagging in BTF, during kernel compilation, define __rcu as attribute btf_type_tag("rcu") so __rcu information can be preserved in dwarf and btf, and later can be used for bpf prog verification.
Acked-by: KP Singh Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053206.2373141-1-yhs@fb.com Signed-off-by: Alexei Starovoitov
--- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux')
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index eb0466236661..7c1afe0f4129 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h
@@ -49,7 +49,8 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # endif # define __iomem # define __percpu BTF_TYPE_TAG(percpu) -# define __rcu +# define __rcu BTF_TYPE_TAG(rcu) + # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 /* context/locking */
-- cgit v1.2.3
From 01685c5bddaa6df3d662c8afed5e5289fcc68e5a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:11 -0800 Subject: bpf: Introduce might_sleep field in bpf_func_proto
Introduce bpf_func_proto->might_sleep to indicate a particular helper might sleep. This will make it easier to later check whether a helper might sleep.
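As an illustration (a hypothetical helper, not part of this patch), a proto only needs to set the new field for the verifier to enforce the constraint:

    BPF_CALL_1(bpf_foo, const void *, ptr)
    {
            /* ... may block ... */
            return 0;
    }

    static const struct bpf_func_proto bpf_foo_proto = {
            .func           = bpf_foo,
            .gpl_only       = false,
            .might_sleep    = true, /* rejected in non-sleepable progs */
            .ret_type       = RET_INTEGER,
            .arg1_type      = ARG_ANYTHING,
    };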
Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053211.2373553-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/bpf_lsm.c | 6 ++++-- kernel/bpf/helpers.c | 2 ++ kernel/bpf/verifier.c | 5 +++++ kernel/trace/bpf_trace.c | 4 ++-- 5 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c9eafa67f2a2..43fd7eeeeabb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -682,6 +682,7 @@ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; + bool might_sleep; enum bpf_return_type ret_type; union { struct { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index d6c9b3705f24..ae0267f150b5 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -151,6 +151,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .func = bpf_ima_inode_hash, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], @@ -169,6 +170,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) static const struct bpf_func_proto bpf_ima_file_hash_proto = { .func = bpf_ima_file_hash, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], @@ -221,9 +223,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: - return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + return &bpf_ima_inode_hash_proto; case BPF_FUNC_ima_file_hash: - return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + return &bpf_ima_file_hash_proto; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; #ifdef CONFIG_NET diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2299bb0d89e5..9296b654dbd7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -661,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -691,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9528a066cfa5..068cc885903c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7516,6 +7516,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (!env->prog->aux->sleepable && fn->might_sleep) { + verbose(env, "helper call might sleep in a non-sleepable prog\n"); + return -EINVAL; + } + /* With LD_ABS/IND some JITs save/restore skb from r1. 
*/ changes_data = bpf_helper_changes_pkt_data(fn->func); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5b9008bc597b..3bbd3f0c810c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1485,9 +1485,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: - return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: - return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: -- cgit v1.2.3 From 9bb00b2895cbfe0ad410457b605d0a72524168c1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:17 -0800 Subject: bpf: Add kfunc bpf_rcu_read_lock/unlock() Add two kfuncs, bpf_rcu_read_lock() and bpf_rcu_read_unlock(). These two kfuncs can be used for all program types. The following example shows how rcu pointers are used with bpf_rcu_read_lock()/bpf_rcu_read_unlock(). struct task_struct { ... struct task_struct *last_wakee; struct task_struct __rcu *real_parent; ... }; Let us say a prog does 'task = bpf_get_current_task_btf()' to get a 'task' pointer. The basic rules are: - 'real_parent = task->real_parent' should be inside the bpf_rcu_read_lock region. This simulates the rcu_dereference() operation. The 'real_parent' is marked as MEM_RCU only if (1). task->real_parent is inside the bpf_rcu_read_lock region, and (2). task is a trusted ptr. So a MEM_RCU-marked ptr can be 'trusted' inside the bpf_rcu_read_lock region. - 'last_wakee = real_parent->last_wakee' should be inside the bpf_rcu_read_lock region since it tries to access rcu protected memory. - the ptr 'last_wakee' will be marked as PTR_UNTRUSTED since, in general, it is not clear whether the object pointed to by 'last_wakee' is valid, even inside the bpf_rcu_read_lock region. The verifier will reset all rcu pointer register states to untrusted at the bpf_rcu_read_unlock() kfunc call site, so any such rcu pointer won't be trusted any more outside the bpf_rcu_read_lock() region. The current implementation does not support nested rcu read lock regions in the prog. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053217.2373910-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 + include/linux/bpf_verifier.h | 5 +- kernel/bpf/btf.c | 3 + kernel/bpf/helpers.c | 12 ++++ kernel/bpf/verifier.c | 162 +++++++++++++++++++++++++++++++++++-------- 5 files changed, 155 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 43fd7eeeeabb..c6aa6912ea16 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -572,6 +572,9 @@ enum bpf_type_flag { */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. 
*/ + MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 545152ac136c..c05aa6e1f6f5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -344,6 +344,7 @@ struct bpf_verifier_state { u32 id; } active_lock; bool speculative; + bool active_rcu_lock; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -445,6 +446,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ @@ -534,6 +536,7 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; + bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; @@ -680,7 +683,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9dbfda2b5c6c..bd3369100239 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6238,6 +6238,9 @@ error: /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; + /* check __rcu tag */ + if (strcmp(tag_value, "rcu") == 0) + tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9296b654dbd7..a5a511430f2a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1990,6 +1990,16 @@ void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) return obj__ign; } +void bpf_rcu_read_lock(void) +{ + rcu_read_lock(); +} + +void bpf_rcu_read_unlock(void) +{ + rcu_read_unlock(); +} + __diag_pop(); BTF_SET8_START(generic_btf_ids) @@ -2031,6 +2041,8 @@ BTF_ID(func, bpf_cgroup_release) BTF_SET8_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_rcu_read_lock) +BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 068cc885903c..f4500479f1c2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -527,6 +527,14 @@ static bool is_callback_calling_function(enum bpf_func_id func_id) func_id == BPF_FUNC_user_ringbuf_drain; } +static bool is_storage_get_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_storage_get || + func_id == BPF_FUNC_inode_storage_get || + func_id == BPF_FUNC_task_storage_get || + func_id == BPF_FUNC_cgrp_storage_get; +} + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -589,11 +597,12 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s", + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", type & MEM_RDONLY ? "rdonly_" : "", type & MEM_RINGBUF ? "ringbuf_" : "", type & MEM_USER ? 
"user_" : "", type & MEM_PERCPU ? "percpu_" : "", + type & MEM_RCU ? "rcu_" : "", type & PTR_UNTRUSTED ? "untrusted_" : "", type & PTR_TRUSTED ? "trusted_" : "" ); @@ -1220,6 +1229,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->speculative = src->speculative; + dst_state->active_rcu_lock = src->active_rcu_lock; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; dst_state->active_lock.id = src->active_lock.id; @@ -4258,6 +4268,25 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static bool is_trusted_reg(const struct bpf_reg_state *reg) +{ + /* A referenced register is always trusted. */ + if (reg->ref_obj_id) + return true; + + /* If a register is not referenced, it is trusted if it has the + * MEM_ALLOC, MEM_RCU or PTR_TRUSTED type modifiers, and no others. Some of the + * other type modifiers may be safe, but we elect to take an opt-in + * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are + * not. + * + * Eventually, we should make PTR_TRUSTED the single source of truth + * for whether a register is trusted. + */ + return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && + !bpf_type_has_unsafe_modifiers(reg->type); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -4737,9 +4766,28 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; - /* Any pointer obtained from walking a trusted pointer is no longer trusted. */ + /* By default any pointer obtained from walking a trusted pointer is + * no longer trusted except the rcu case below. + */ flag &= ~PTR_TRUSTED; + if (flag & MEM_RCU) { + /* Mark value register as MEM_RCU only if it is protected by + * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU + * itself can already indicate trustedness inside the rcu + * read lock region. Also mark it as PTR_TRUSTED. + */ + if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg)) + flag &= ~MEM_RCU; + else + flag |= PTR_TRUSTED; + } else if (reg->type & MEM_RCU) { + /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged + * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. + */ + flag |= PTR_UNTRUSTED; + } + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -5897,6 +5945,7 @@ static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID, PTR_TO_BTF_ID | PTR_TRUSTED, + PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED, }, }; static const struct bpf_reg_types percpu_btf_ptr_types = { @@ -6075,6 +6124,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. 
In the other cases, fixed offset @@ -7539,6 +7589,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } + if (env->cur_state->active_rcu_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (env->prog->aux->sleepable && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -7966,25 +8027,6 @@ static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); } -static bool is_trusted_reg(const struct bpf_reg_state *reg) -{ - /* A referenced register is always trusted. */ - if (reg->ref_obj_id) - return true; - - /* If a register is not referenced, it is trusted if it has either the - * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the - * other type modifiers may be safe, but we elect to take an opt-in - * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are - * not. - * - * Eventually, we should make PTR_TRUSTED the single source of truth - * for whether a register is trusted. - */ - return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && - !bpf_type_has_unsafe_modifiers(reg->type); -} - static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -8163,6 +8205,8 @@ enum special_kfunc_type { KF_bpf_list_pop_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, + KF_bpf_rcu_read_lock, + KF_bpf_rcu_read_unlock, }; BTF_SET_START(special_kfunc_set) @@ -8185,6 +8229,18 @@ BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rcu_read_lock) +BTF_ID(func, bpf_rcu_read_unlock) + +static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock]; +} + +static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; +} static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, @@ -8817,6 +8873,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; + bool sleepable, rcu_lock, rcu_unlock; struct bpf_kfunc_call_arg_meta meta; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; @@ -8858,11 +8915,45 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } - if (is_kfunc_sleepable(&meta) && !env->prog->aux->sleepable) { + sleepable = is_kfunc_sleepable(&meta); + if (sleepable && !env->prog->aux->sleepable) { verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); return -EACCES; } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); + rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) { + verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name); + return -EACCES; + } + + if (env->cur_state->active_rcu_lock) { + struct bpf_func_state *state; + struct bpf_reg_state *reg; + + if (rcu_lock) { + verbose(env, "nested rcu read lock (kernel function %s)\n", 
func_name); + return -EINVAL; + } else if (rcu_unlock) { + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_TRUSTED); + reg->type |= PTR_UNTRUSTED; + } + })); + env->cur_state->active_rcu_lock = false; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + } else if (rcu_lock) { + env->cur_state->active_rcu_lock = true; + } else if (rcu_unlock) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); + return -EINVAL; + } + /* Check the arguments */ err = check_kfunc_args(env, &meta); if (err < 0) @@ -11754,6 +11845,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -13019,6 +13115,9 @@ static bool states_equal(struct bpf_verifier_env *env, old->active_lock.id != cur->active_lock.id) return false; + if (old->active_rcu_lock != cur->active_rcu_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -13706,6 +13805,11 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "bpf_rcu_read_unlock is missing\n"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback @@ -15494,14 +15598,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_task_storage_get || - insn->imm == BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get || - insn->imm == BPF_FUNC_cgrp_storage_get) { - if (env->prog->aux->sleepable) - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); - else + if (is_storage_get_function(insn->imm)) { + if (!env->prog->aux->sleepable || + env->insn_aux_data[i + delta].storage_get_func_atomic) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); insn_buf[1] = *insn; cnt = 2; @@ -16580,6 +16682,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); + env->rcu_tag_supported = btf_vmlinux && + btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0; if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; -- cgit v1.2.3 From a66d79ee0bd5140a64b72cde588f8c83a55a1eb9 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Thu, 24 Nov 2022 11:18:14 +0800 Subject: net: ethernet: mtk_wed: add wcid overwritten support for wed v1 All wed versions should enable the wcid overwritten feature, since the wcid size is controlled by the wlan driver. Tested-by: Sujuan Chen Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Signed-off-by: Sujuan Chen Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mediatek/mtk_wed.c | 13 ++++++++----- drivers/net/ethernet/mediatek/mtk_wed_regs.h | 2 ++ include/linux/soc/mediatek/mtk_wed.h | 3 +++ 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 7d8842378c2b..16d7af88bbac 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -526,9 +526,9 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) MTK_WED_WPDMA_RX_D_RX_DRV_EN); wed_clr(dev, MTK_WED_WDMA_GLO_CFG, MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK); - - mtk_wed_set_512_support(dev, false); } + + mtk_wed_set_512_support(dev, false); } static void @@ -1290,9 +1290,10 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask) if (mtk_wed_rro_cfg(dev)) return; - mtk_wed_set_512_support(dev, dev->wlan.wcid_512); } + mtk_wed_set_512_support(dev, dev->wlan.wcid_512); + mtk_wed_dma_enable(dev); dev->running = true; } @@ -1358,11 +1359,13 @@ mtk_wed_attach(struct mtk_wed_device *dev) } mtk_wed_hw_init_early(dev); - if (hw->version == 1) + if (hw->version == 1) { regmap_update_bits(hw->hifsys, HIFSYS_DMA_AG_MAP, BIT(hw->index), 0); - else + } else { + dev->rev_id = wed_r32(dev, MTK_WED_REV_ID); ret = mtk_wed_wo_init(hw); + } out: if (ret) mtk_wed_detach(dev); diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h index 9e39dace95eb..873d50b9a6e6 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h +++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h @@ -20,6 +20,8 @@ struct mtk_wdma_desc { __le32 info; } __packed __aligned(4); +#define MTK_WED_REV_ID 0x004 + #define MTK_WED_RESET 0x008 #define MTK_WED_RESET_TX_BM BIT(0) #define MTK_WED_RESET_TX_FREE_AGENT BIT(4) diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 8294978f4bca..1b1ef57609f7 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -85,6 +85,9 @@ struct mtk_wed_device { int irq; u8 version; + /* used by wlan driver */ + u32 rev_id; + struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; -- cgit v1.2.3 From 4c47867bc789bdc722f3bb760355c2c246fbe9af Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 24 Nov 2022 12:15:54 +0100 Subject: of: net: export of_get_mac_address_nvmem() Export of_get_mac_addr_nvmem() and rename it to of_get_mac_address_nvmem() in order to fit the convention followed by the existing exported helpers of the same kind. This way, OF-compatible drivers using e.g. fwnode_get_mac_address() can do a direct call to it instead of calling of_get_mac_address() just for the nvmem step, avoiding repeating an expensive DT lookup that has already been done once. Eventually, fwnode_get_mac_address() should probably be updated to perform the nvmem lookup directly, but as of today, nvmem cells do not appear to be supported by ACPI yet, which would defeat this kind of extension.
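A minimal usage sketch of the newly exported helper ('np' and 'ndev' stand for hypothetical driver context; the fallback policy is the caller's choice, not part of this patch):

    u8 mac[ETH_ALEN];

    /* the DT property lookup was already done elsewhere; query only the nvmem cell */
    if (!of_get_mac_address_nvmem(np, mac))
            eth_hw_addr_set(ndev, mac);
    else
            eth_hw_addr_random(ndev);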
Suggested-by: Marcin Wojtas Signed-off-by: Miquel Raynal Signed-off-by: Paolo Abeni --- include/linux/of_net.h | 6 ++++++ net/core/of_net.c | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 0484b613ca64..d88715a0b3a5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -14,6 +14,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern int of_get_mac_address(struct device_node *np, u8 *mac); +extern int of_get_mac_address_nvmem(struct device_node *np, u8 *mac); int of_get_ethdev_address(struct device_node *np, struct net_device *dev); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else @@ -28,6 +29,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac) return -ENODEV; } +static inline int of_get_mac_address_nvmem(struct device_node *np, u8 *mac) +{ + return -ENODEV; +} + static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev) { return -ENODEV; diff --git a/net/core/of_net.c b/net/core/of_net.c index f1a9bf7578e7..55d3fe229269 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -57,7 +57,7 @@ static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) return -ENODEV; } -static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) +int of_get_mac_address_nvmem(struct device_node *np, u8 *addr) { struct platform_device *pdev = of_find_device_by_node(np); struct nvmem_cell *cell; @@ -94,6 +94,7 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) return 0; } +EXPORT_SYMBOL(of_get_mac_address_nvmem); /** * of_get_mac_address() @@ -140,7 +141,7 @@ int of_get_mac_address(struct device_node *np, u8 *addr) if (!ret) return 0; - return of_get_mac_addr_nvmem(np, addr); + return of_get_mac_address_nvmem(np, addr); } EXPORT_SYMBOL(of_get_mac_address); -- cgit v1.2.3 From f78cd9c783e09a0fe454b0fc8b39c22025d7869e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:53 +0100 Subject: net: ethernet: mtk_wed: update mtk_wed_stop Update mtk_wed_stop routine and rename old mtk_wed_stop() to mtk_wed_deinit(). This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. 
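A hedged sketch of how a WLAN driver's recovery path might consume the two new ops added below, once the reset series is complete (the surrounding flow is an assumption, not part of this patch):

    if (mtk_wed_device_active(wed)) {
            mtk_wed_device_stop(wed);       /* mask interrupts, quiesce WED */
            /* ... driver-side ring/DMA teardown ... */
            mtk_wed_device_dma_reset(wed);  /* reset WED DMA state */
    }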
Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_wed.c | 32 +++++++++++++++++++++++--------- include/linux/soc/mediatek/mtk_wed.h | 4 ++++ 2 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index dbe10bbdb6f6..42b7e0407a9b 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -539,14 +539,8 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) static void mtk_wed_stop(struct mtk_wed_device *dev) { - mtk_wed_dma_disable(dev); mtk_wed_set_ext_int(dev, false); - wed_clr(dev, MTK_WED_CTRL, - MTK_WED_CTRL_WDMA_INT_AGENT_EN | - MTK_WED_CTRL_WPDMA_INT_AGENT_EN | - MTK_WED_CTRL_WED_TX_BM_EN | - MTK_WED_CTRL_WED_TX_FREE_AGENT_EN); wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER, 0); wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, 0); wdma_w32(dev, MTK_WDMA_INT_MASK, 0); @@ -558,7 +552,27 @@ mtk_wed_stop(struct mtk_wed_device *dev) wed_w32(dev, MTK_WED_EXT_INT_MASK1, 0); wed_w32(dev, MTK_WED_EXT_INT_MASK2, 0); - wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_BM_EN); +} + +static void +mtk_wed_deinit(struct mtk_wed_device *dev) +{ + mtk_wed_stop(dev); + mtk_wed_dma_disable(dev); + + wed_clr(dev, MTK_WED_CTRL, + MTK_WED_CTRL_WDMA_INT_AGENT_EN | + MTK_WED_CTRL_WPDMA_INT_AGENT_EN | + MTK_WED_CTRL_WED_TX_BM_EN | + MTK_WED_CTRL_WED_TX_FREE_AGENT_EN); + + if (dev->hw->version == 1) + return; + + wed_clr(dev, MTK_WED_CTRL, + MTK_WED_CTRL_RX_ROUTE_QM_EN | + MTK_WED_CTRL_WED_RX_BM_EN | + MTK_WED_CTRL_RX_RRO_QM_EN); } static void @@ -568,7 +582,7 @@ mtk_wed_detach(struct mtk_wed_device *dev) mutex_lock(&hw_lock); - mtk_wed_stop(dev); + mtk_wed_deinit(dev); mtk_wdma_rx_reset(dev); mtk_wed_reset(dev, MTK_WED_RESET_WED); @@ -670,7 +684,7 @@ mtk_wed_hw_init_early(struct mtk_wed_device *dev) { u32 mask, set; - mtk_wed_stop(dev); + mtk_wed_deinit(dev); mtk_wed_reset(dev, MTK_WED_RESET_WED); mtk_wed_set_wpdma(dev); diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 1b1ef57609f7..c43510e541df 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -234,6 +234,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ (_dev)->ops->msg_update(_dev, _id, _msg, _len) +#define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev) +#define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -250,6 +252,8 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV +#define mtk_wed_device_stop(_dev) do {} while (0) +#define mtk_wed_device_dma_reset(_dev) do {} while (0) #endif #endif -- cgit v1.2.3 From 23dca7a90017ff2512c501f7da4c7ca7a95c2d6e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:55 +0100 Subject: net: ethernet: mtk_wed: add reset to tx_ring_setup callback Introduce reset parameter to mtk_wed_tx_ring_setup signature. This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. 
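For illustration, the caller-visible difference is only the new flag; a recovery path would re-run setup over the already-allocated ring (a sketch assuming an mt76-style caller as in the diff below):

    /* initial bring-up: allocate WED ring memory */
    mtk_wed_device_tx_ring_setup(wed, ring, q->regs, false);

    /* after a WED reset: reprogram the registers, skip the allocation */
    mtk_wed_device_tx_ring_setup(wed, ring, q->regs, true);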
Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_wed.c | 19 +++++++++++-------- drivers/net/wireless/mediatek/mt76/dma.c | 2 +- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 9594885d5e50..d041615b2bac 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1181,7 +1181,8 @@ mtk_wed_ring_alloc(struct mtk_wed_device *dev, struct mtk_wed_ring *ring, } static int -mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size) +mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size, + bool reset) { u32 desc_size = sizeof(struct mtk_wdma_desc) * dev->hw->version; struct mtk_wed_ring *wdma; @@ -1190,8 +1191,8 @@ mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size) return -EINVAL; wdma = &dev->rx_wdma[idx]; - if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size, - true)) + if (!reset && mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, + desc_size, true)) return -ENOMEM; wdma_w32(dev, MTK_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_BASE, @@ -1389,7 +1390,7 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask) for (i = 0; i < ARRAY_SIZE(dev->rx_wdma); i++) if (!dev->rx_wdma[i].desc) - mtk_wed_wdma_rx_ring_setup(dev, i, 16); + mtk_wed_wdma_rx_ring_setup(dev, i, 16, false); mtk_wed_hw_init(dev); mtk_wed_configure_irq(dev, irq_mask); @@ -1498,7 +1499,8 @@ unlock: } static int -mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) +mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs, + bool reset) { struct mtk_wed_ring *ring = &dev->tx_ring[idx]; @@ -1517,11 +1519,12 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) if (WARN_ON(idx >= ARRAY_SIZE(dev->tx_ring))) return -EINVAL; - if (mtk_wed_ring_alloc(dev, ring, MTK_WED_TX_RING_SIZE, - sizeof(*ring->desc), true)) + if (!reset && mtk_wed_ring_alloc(dev, ring, MTK_WED_TX_RING_SIZE, + sizeof(*ring->desc), true)) return -ENOMEM; - if (mtk_wed_wdma_rx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE)) + if (mtk_wed_wdma_rx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE, + reset)) return -ENOMEM; ring->reg_base = MTK_WED_RING_TX(idx); diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 7378c4d1e156..28cbf07046c3 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -502,7 +502,7 @@ mt76_dma_wed_setup(struct mt76_dev *dev, struct mt76_queue *q) switch (type) { case MT76_WED_Q_TX: - ret = mtk_wed_device_tx_ring_setup(wed, ring, q->regs); + ret = mtk_wed_device_tx_ring_setup(wed, ring, q->regs, false); if (!ret) q->wed_regs = wed->tx_ring[ring].reg_base; break; diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index c43510e541df..beb190449704 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -158,7 +158,7 @@ struct mtk_wed_device { struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); int 
(*txfree_ring_setup)(struct mtk_wed_device *dev, @@ -216,8 +216,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) #define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->tx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->tx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_txfree_ring_setup(_dev, _regs) \ (_dev)->ops->txfree_ring_setup(_dev, _regs) #define mtk_wed_device_reg_read(_dev, _reg) \ @@ -243,7 +243,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) } #define mtk_wed_device_detach(_dev) do {} while (0) #define mtk_wed_device_start(_dev, _mask) do {} while (0) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_reg_read(_dev, _reg) 0 #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) -- cgit v1.2.3 From 12eb0f84a601cec8920b56d7ffd4182a6bfd6521 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 29 Sep 2022 13:45:08 +0200 Subject: net/mlx5: Remove unused ctx variables Remove mlx5_priv.ctx_list and ctx_lock, which are no longer used after commit 601c10c89cbb ("net/mlx5: Delete custom device management logic"). Signed-off-by: Petr Pavlu Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 -- include/linux/mlx5/driver.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 25e87e5d9270..7f5db13e3550 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1603,8 +1603,6 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) int err; memcpy(&dev->profile, &profile[profile_idx], sizeof(dev->profile)); - INIT_LIST_HEAD(&priv->ctx_list); - spin_lock_init(&priv->ctx_lock); lockdep_register_key(&dev->lock_key); mutex_init(&dev->intf_state_mutex); lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 06cbad166225..d476255c9a3f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -606,8 +606,6 @@ struct mlx5_priv { struct list_head pgdir_list; /* end: alloc staff */ - struct list_head ctx_list; - spinlock_t ctx_lock; struct mlx5_adev **adev; int adev_idx; int sw_vhca_id; -- cgit v1.2.3 From b146658f2ed90768b769222ca418617274242b32 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:18:22 +0200 Subject: net/mlx5e: Add padding when needed in UMR WQEs Per the device spec, the MTT/KLM list in a UMR WQE must be aligned to 64B. Per our SW design, the MTT/KLM list needs such alignment only if it is too small, for example on PPC when PAGE_SIZE is 64KB and only 4 pages are needed to cover an MPWQE of size 256KB. Padding, if needed, is taken into account when calculating the UMR WQE fields (ds_cnt and xlt_octowords); however, no entries are provided for it, so garbage is passed instead. No real harm though, as these parts act as gaps between the RX MPWQEs and are not used by any of them. Hence, in practice, the device does not try to write any incoming packet to them.
Still, prefer providing clean padding marking the end of the list, and do not map garbage into the RQ memory region. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 11 +++++++++++ include/linux/mlx5/device.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index b1ea0b995d9c..8d71736116e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -727,6 +727,17 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) }; } + /* Pad if needed, in case the value set to ucseg->xlt_octowords + * in mlx5e_build_umr_wqe() needed alignment. + */ + if (rq->mpwqe.pages_per_wqe & (MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1)) { + int pad = ALIGN(rq->mpwqe.pages_per_wqe, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT) - + rq->mpwqe.pages_per_wqe; + + memset(&umr_wqe->inline_mtts[rq->mpwqe.pages_per_wqe], 0, + sizeof(*umr_wqe->inline_mtts) * pad); + } + bitmap_zero(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe); wi->consumed_strides = 0; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index eb3fac30488b..97275965f156 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -294,6 +294,7 @@ enum { #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From 683d78a0d46235edfd3c50ddbc4af1065b422670 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:30:23 +0200 Subject: net/mlx5: Remove unused UMR MTT definitions Defines MLX5_UMR_MTT_MASK and MLX5_UMR_MTT_MIN_CHUNK_SIZE are not in use. Remove them. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 97275965f156..ecf840e2989b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -292,8 +292,6 @@ enum { #define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) -#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From 02648b4b09d506bd4df2e17bf109c229fc728640 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:44:57 +0200 Subject: net/mlx5: Generalize name of UMR alignment definition Per the device spec, MLX5_UMR_MTT_ALIGNMENT is good not only for UMR MTT entries, but for all other entries as well, like KLMs and KSMs. 
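For concreteness, assuming sizeof(struct mlx5_mtt) == 8 per its in-tree definition (a single __be64 pointer), the entry-count alignment introduced above works out as:

    MLX5_UMR_FLEX_ALIGNMENT            = 0x40 (64 bytes)
    MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT = 64 / sizeof(struct mlx5_mtt)
                                       = 64 / 8 = 8 MTT entries per 64-byte unit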
Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/odp.c | 3 +-- drivers/infiniband/hw/mlx5/umr.c | 14 +++++++------- drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- include/linux/mlx5/device.h | 4 ++-- 6 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index bc97958818bb..e6e021af6aa9 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -230,8 +230,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, struct ib_umem_odp *umem_odp = container_of(mni, struct ib_umem_odp, notifier); struct mlx5_ib_mr *mr; - const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / - sizeof(struct mlx5_mtt)) - 1; + const u64 umr_block_mask = MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1; u64 idx = 0, blk_start_idx = 0; u64 invalidations = 0; unsigned long start; diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index d5105b5c9979..029e9536ec28 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -418,7 +418,7 @@ int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, } #define MLX5_MAX_UMR_CHUNK \ - ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_MTT_ALIGNMENT) + ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT) #define MLX5_SPARE_UMR_CHUNK 0x10000 /* @@ -428,11 +428,11 @@ int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, */ static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) { - const size_t xlt_chunk_align = MLX5_UMR_MTT_ALIGNMENT / ent_size; + const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size; size_t size; void *res = NULL; - static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); + static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0); /* * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the @@ -666,7 +666,7 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) } final_size = (void *)cur_mtt - (void *)mtt; - sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); + sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); memset(cur_mtt, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); @@ -690,7 +690,7 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) ? 
sizeof(struct mlx5_klm) : sizeof(struct mlx5_mtt); - const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size; struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; const int page_mask = page_align - 1; @@ -711,7 +711,7 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, if (WARN_ON(!mr->umem->is_odp)) return -EINVAL; - /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + /* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes, * so we need to align the offset and length accordingly */ if (idx & page_mask) { @@ -748,7 +748,7 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); - sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT); + sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT); if (pages_mapped + pages_iter >= pages_to_map) mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index ff5b302531d5..3cad59ac1b48 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -103,11 +103,11 @@ struct page_pool; * size actually used at runtime, but it's not a problem when calculating static * array sizes. */ -#define MLX5_UMR_MAX_MTT_SPACE \ +#define MLX5_UMR_MAX_FLEX_SPACE \ (ALIGN_DOWN(MLX5_SEND_WQE_MAX_SIZE - sizeof(struct mlx5e_umr_wqe), \ - MLX5_UMR_MTT_ALIGNMENT)) + MLX5_UMR_FLEX_ALIGNMENT)) #define MLX5_MPWRQ_MAX_PAGES_PER_WQE \ - rounddown_pow_of_two(MLX5_UMR_MAX_MTT_SPACE / sizeof(struct mlx5_mtt)) + rounddown_pow_of_two(MLX5_UMR_MAX_FLEX_SPACE / sizeof(struct mlx5_mtt)) #define MLX5E_MAX_RQ_NUM_MTTS \ (ALIGN_DOWN(U16_MAX, 4) * 2) /* Fits into u16 and aligned by WQEBB. */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 9ec9662d1d0b..585bdc8383ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -107,7 +107,7 @@ u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, /* Keep in sync with MLX5_MPWRQ_MAX_PAGES_PER_WQE. 
*/ max_wqe_size = mlx5e_get_max_sq_aligned_wqebbs(mdev) * MLX5_SEND_WQE_BB; max_pages_per_wqe = ALIGN_DOWN(max_wqe_size - sizeof(struct mlx5e_umr_wqe), - MLX5_UMR_MTT_ALIGNMENT) / umr_entry_size; + MLX5_UMR_FLEX_ALIGNMENT) / umr_entry_size; max_log_mpwqe_size = ilog2(max_pages_per_wqe) + page_shift; WARN_ON_ONCE(max_log_mpwqe_size < MLX5E_ORDER2_MAX_PACKET_MTU); @@ -146,7 +146,7 @@ u16 mlx5e_mpwrq_umr_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, u16 umr_wqe_sz; umr_wqe_sz = sizeof(struct mlx5e_umr_wqe) + - ALIGN(pages_per_wqe * umr_entry_size, MLX5_UMR_MTT_ALIGNMENT); + ALIGN(pages_per_wqe * umr_entry_size, MLX5_UMR_FLEX_ALIGNMENT); WARN_ON_ONCE(DIV_ROUND_UP(umr_wqe_sz, MLX5_SEND_WQE_DS) > MLX5_WQE_CTRL_DS_MASK); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 217c8a478977..199387c6bf16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -208,7 +208,7 @@ static u16 mlx5e_mpwrq_umr_octowords(u32 entries, enum mlx5e_mpwrq_umr_mode umr_ u8 umr_entry_size = mlx5e_mpwrq_umr_entry_size(umr_mode); u32 sz; - sz = ALIGN(entries * umr_entry_size, MLX5_UMR_MTT_ALIGNMENT); + sz = ALIGN(entries * umr_entry_size, MLX5_UMR_FLEX_ALIGNMENT); return sz / MLX5_OCTWORD; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ecf840e2989b..a02f779f5c5b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -291,8 +291,8 @@ enum { }; #define MLX5_UMR_KLM_ALIGNMENT 4 -#define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_FLEX_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From daab2e9c54a52dea8dbd1af516e6f986eeed2556 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:24:02 +0200 Subject: net/mlx5: Use generic definition for UMR KLM alignment MLX5_UMR_KLM_ALIGNMENT is in units of number of entries, while MLX5_UMR_MTT_ALIGNMENT (generalized and renamed to MLX5_UMR_FLEX_ALIGNMENT) is in byte units. This is misleading and confusing. Replace this KLM definition with one based on the generic definition. 
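The same arithmetic, assuming sizeof(struct mlx5_klm) == 16 per its in-tree definition (two __be32 fields plus one __be64), shows the derived value matches the old literal:

    MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT = MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)
                                       = 64 / 16 = 4 KLM entries

i.e. equal to the removed hard-coded MLX5_UMR_KLM_ALIGNMENT of 4, only now expressed in a unit consistent with the byte-based definition.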
Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 10 +++++----- include/linux/mlx5/device.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3cad59ac1b48..65790ff58a74 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -160,7 +160,7 @@ struct page_pool; (((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_klm)) #define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\ - ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_ALIGNMENT) + ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT) #define MLX5E_MAX_KLM_PER_WQE(mdev) \ MLX5E_KLM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 8d71736116e0..c8820ab22169 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -593,8 +593,8 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, int headroom, i; headroom = rq->buff.headroom; - new_entries = klm_entries - (shampo->pi & (MLX5_UMR_KLM_ALIGNMENT - 1)); - entries = ALIGN(klm_entries, MLX5_UMR_KLM_ALIGNMENT); + new_entries = klm_entries - (shampo->pi & (MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT - 1)); + entries = ALIGN(klm_entries, MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT); wqe_bbs = MLX5E_KLM_UMR_WQEBBS(entries); pi = mlx5e_icosq_get_next_pi(sq, wqe_bbs); umr_wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); @@ -603,7 +603,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, for (i = 0; i < entries; i++, index++) { dma_info = &shampo->info[index]; if (i >= klm_entries || (index < shampo->pi && shampo->pi - index < - MLX5_UMR_KLM_ALIGNMENT)) + MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)) goto update_klm; header_offset = (index & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) << MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; @@ -668,8 +668,8 @@ static int mlx5e_alloc_rx_hd_mpwqe(struct mlx5e_rq *rq) if (!klm_entries) return 0; - klm_entries += (shampo->pi & (MLX5_UMR_KLM_ALIGNMENT - 1)); - index = ALIGN_DOWN(shampo->pi, MLX5_UMR_KLM_ALIGNMENT); + klm_entries += (shampo->pi & (MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT - 1)); + index = ALIGN_DOWN(shampo->pi, MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT); entries_before = shampo->hd_per_wq - index; if (unlikely(entries_before < klm_entries)) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a02f779f5c5b..5fe5d198b57a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,9 +290,9 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; -#define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit v1.2.3 From e9374524950512a1769f610a868fcdf89ea59b8e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 22 Nov 2022 20:30:57 +0100 Subject: netfilter: ipset: Add support for new bitmask parameter Add a new parameter to complement the existing 'netmask' option. 
The main difference between netmask and bitmask is that bitmask takes any arbitrary ip address as input, it does not have to be a valid netmask. The name of the new parameter is 'bitmask'. This lets us mask out arbitrary bits in the ip address, for example: ipset create set1 hash:ip bitmask 255.128.255.0 ipset create set2 hash:ip,port family inet6 bitmask ffff::ff80 Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 10 ++++ include/uapi/linux/netfilter/ipset/ip_set.h | 2 + net/netfilter/ipset/ip_set_hash_gen.h | 71 +++++++++++++++++++++++++---- net/netfilter/ipset/ip_set_hash_ip.c | 19 ++++---- net/netfilter/ipset/ip_set_hash_ipport.c | 24 +++++++++- net/netfilter/ipset/ip_set_hash_netnet.c | 26 +++++++++-- 6 files changed, 126 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 79e5d68b87af..333807efd32b 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -85,6 +85,7 @@ enum { IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ IPSET_ATTR_MARK, /* 10 */ IPSET_ATTR_MARKMASK, /* 11 */ + IPSET_ATTR_BITMASK, /* 12 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ @@ -153,6 +154,7 @@ enum ipset_errno { IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_SKBINFO, + IPSET_ERR_BITMASK_NETMASK_EXCL, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 3adc291d9ce1..fdb5225a3e5c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -159,6 +159,17 @@ htable_size(u8 hbits) (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) +static const union nf_inet_addr onesmask = { + .all[0] = 0xffffffff, + .all[1] = 0xffffffff, + .all[2] = 0xffffffff, + .all[3] = 0xffffffff +}; + +static const union nf_inet_addr zeromask = {}; +#endif + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -283,8 +294,9 @@ struct htype { u32 markmask; /* markmask value for mark mask to store */ #endif u8 bucketsize; /* max elements in an array block */ -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ + union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ @@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) /* Resizing 
changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && -#ifdef IP_SET_HASH_WITH_NETMASK - x->netmask == y->netmask && +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && @@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_BITMASK + /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask + * and not bitmask. These two are mutually exclusive. */ + if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { + if (set->family == NFPROTO_IPV4) { + if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) + goto nla_put_failure; + } else if (set->family == NFPROTO_IPV6) { + if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) + goto nla_put_failure; + } + } +#endif #ifdef IP_SET_HASH_WITH_NETMASK - if (h->netmask != HOST_MASK && - nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK @@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u32 markmask; #endif u8 hbits; -#ifdef IP_SET_HASH_WITH_NETMASK - u8 netmask; +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + int ret __attribute__((unused)) = 0; + u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; @@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_NETMASK - netmask = set->family == NFPROTO_IPV4 ? 
32 : 128; if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); @@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; + + /* we convert netmask to bitmask and store it */ + if (set->family == NFPROTO_IPV4) + bitmask.ip = ip_set_netmask(netmask); + else + ip6_netmask(&bitmask, netmask); + } +#endif + +#ifdef IP_SET_HASH_WITH_BITMASK + if (tb[IPSET_ATTR_BITMASK]) { + /* bitmask and netmask do the same thing, allow only one of these options */ + if (tb[IPSET_ATTR_NETMASK]) + return -IPSET_ERR_BITMASK_NETMASK_EXCL; + + if (set->family == NFPROTO_IPV4) { + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); + if (ret || !bitmask.ip) + return -IPSET_ERR_INVALID_NETMASK; + } else if (set->family == NFPROTO_IPV6) { + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); + if (ret || ipv6_addr_any(&bitmask.in6)) + return -IPSET_ERR_INVALID_NETMASK; + } + + if (nf_inet_addr_cmp(&bitmask, &zeromask)) + return -IPSET_ERR_INVALID_NETMASK; } #endif @@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd30c03d5a23..556f74268a28 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -24,7 +24,8 @@ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ -#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ +/* 5 bucketsize, initval support */ +#define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); - ip &= ip_set_netmask(h->netmask); + ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; @@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - ip &= ip_set_hostmask(h->netmask); + ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; @@ -187,12 +189,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static void -hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip6_netmask(ip, prefix); -} - static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { @@ -229,7 +225,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; @@ -268,7 +264,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return 
-IPSET_ERR_HASH_ELEM; @@ -295,6 +291,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7303138e46be..2ffbd0b78a8c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -26,7 +26,8 @@ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ +/* 6 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -253,12 +264,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -298,6 +314,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -356,6 +376,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3d09eefe998a..cdfb78c6e0d3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ /* 2 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ +/* 3 
bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ @@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); - e.ip[0] &= ip_set_netmask(e.cidr[0]); - e.ip[1] &= ip_set_netmask(e.cidr[1]); + e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); + e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -213,8 +216,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { - e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); - e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; @@ -404,6 +407,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -414,6 +422,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -453,6 +462,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -IPSET_ERR_HASH_ELEM; + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -484,6 +498,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, -- cgit v1.2.3 From c67cae551f0df80421b5703ee56ff5e2fe9c4de6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 25 Nov 2022 14:06:17 -0800 Subject: bpf: Tighten ptr_to_btf_id checks. The networking programs typically don't require CAP_PERFMON, but through kfuncs like bpf_cast_to_kern_ctx() they can access memory through PTR_TO_BTF_ID. In such case enforce CAP_PERFMON. Also make sure that only GPL programs can access kernel data structures. All kfuncs require GPL already. Also remove allow_ptr_to_map_access. It's the same as allow_ptr_leaks and different name for the same check only causes confusion. 
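For illustration, a minimal sketch of a program affected by this change (the section name, program body, and the __ksym declaration style are assumptions for the example, not part of this patch). The cast result is a PTR_TO_BTF_ID into a kernel structure, so loading such a program now requires CAP_PERFMON and a GPL-compatible license:

    /* Hypothetical example; headers (vmlinux.h, bpf_helpers.h) omitted.
     * bpf_cast_to_kern_ctx() is the kfunc named in the Fixes tag below.
     */
    void *bpf_cast_to_kern_ctx(void *obj) __ksym;

    SEC("tc")
    int cast_ctx(struct __sk_buff *ctx)
    {
            struct sk_buff *kskb = bpf_cast_to_kern_ctx(ctx);

            /* Direct kernel struct access through PTR_TO_BTF_ID. */
            return kskb->len == 0;
    }

    char _license[] SEC("license") = "GPL";
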
Fixes: fd264ca02094 ("bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx") Fixes: 50c6b8a9aea2 ("selftests/bpf: Add a test for btf_type_tag "percpu"") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221125220617.26846-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 5 ----- include/linux/bpf_verifier.h | 1 - kernel/bpf/verifier.c | 17 ++++++++++++++--- tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c | 1 + tools/testing/selftests/bpf/verifier/map_ptr.c | 8 ++++---- 5 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 67452103bb86..4920ac252754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1909,11 +1909,6 @@ static inline bool bpf_allow_uninit_stack(void) return perfmon_capable(); } -static inline bool bpf_allow_ptr_to_map_access(void) -{ - return perfmon_capable(); -} - static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c05aa6e1f6f5..b5090e89cb3f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -531,7 +531,6 @@ struct bpf_verifier_env { bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; - bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e7f1d085e53..d0ecc0b18b20 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4703,6 +4703,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; + if (!env->allow_ptr_leaks) { + verbose(env, + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { + verbose(env, + "Cannot access kernel 'struct %s' from non-GPL compatible program\n", + tname); + return -EINVAL; + } if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -4823,9 +4835,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!env->allow_ptr_to_map_access) { + if (!env->allow_ptr_leaks) { verbose(env, - "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } @@ -16679,7 +16691,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c index 8feddb8289cf..38f78d9345de 100644 --- a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c +++ b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c @@ -64,3 +64,4 @@ int BPF_PROG(test_percpu_helper, struct cgroup *cgrp, const char *path) return 0; } +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index 1f82021429bf..17ee84dc7766 100644 --- 
a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -9,7 +9,7 @@ }, .fixup_map_array_48b = { 1 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "R1 is bpf_array invalid negative access: off=-8", }, @@ -26,7 +26,7 @@ }, .fixup_map_array_48b = { 3 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "only read from bpf_array is supported", }, @@ -41,7 +41,7 @@ }, .fixup_map_array_48b = { 1 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -57,7 +57,7 @@ }, .fixup_map_array_48b = { 1 }, .result_unpriv = REJECT, - .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", + .errstr_unpriv = "access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = ACCEPT, .retval = 1, }, -- cgit v1.2.3 From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, the msg->flags is 0, and we lost the BPF_F_INGRESS flag. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirection. 
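For context, a minimal sketch of a verdict program that sets the flag in question (the map and program names are assumptions, not part of this patch). BPF_F_INGRESS is only present in msg->flags for the invocation that returned the verdict; this patch caches it in psock->redir_ingress so that later sendmsg() calls covered by apply_bytes still honor it:

    struct {
            __uint(type, BPF_MAP_TYPE_SOCKMAP);
            __uint(max_entries, 2);
            __type(key, __u32);
            __type(value, __u64);
    } sock_map SEC(".maps");

    SEC("sk_msg")
    int redir_ingress(struct sk_msg_md *msg)
    {
            __u32 key = 0;

            /* The ingress flag requested here must survive until the
             * last byte governed by apply_bytes has been pushed.
             */
            return bpf_msg_redirect_map(msg, &sock_map, key, BPF_F_INGRESS);
    }
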
Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + include/net/tcp.h | 4 ++-- net/core/skmsg.c | 9 ++++++--- net/ipv4/tcp_bpf.c | 11 ++++++----- net/tls/tls_sw.c | 6 ++++-- 5 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..b87e7381bddf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2319,8 +2319,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e6b9ced3eda8..53d0251788aa 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -886,13 +886,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { - if (psock->sk_redir) + if (psock->sk_redir) { sock_put(psock->sk_redir); - psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { ret = __SK_DROP; goto out; } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f3e868f4cd9e..275c5ca9e04d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -131,10 +131,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, return ret; } -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags) { - bool ingress = sk_msg_to_ingress(msg); struct sk_psock *psock = sk_psock_get(sk); int ret; @@ -276,7 +275,7 @@ msg_bytes_ready: static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = sk_msg_full(msg); + bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; @@ -322,6 +321,7 @@ more_data: sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { @@ -338,7 +338,8 @@ more_data: release_sock(sk); origsize = msg->sg.size; - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) diff --git a/net/tls/tls_sw.c 
b/net/tls/tls_sw.c index 264cf367e265..9ed978634125 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -792,7 +792,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; - bool enospc, policy; + bool enospc, policy, redir_ingress; int err = 0, send; u32 delta = 0; @@ -837,6 +837,7 @@ more_data: } break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) @@ -846,7 +847,8 @@ more_data: sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); - err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + &msg_redir, send, flags); lock_sock(sk); if (err < 0) { *copied -= sk_msg_free_nocharge(sk, &msg_redir); -- cgit v1.2.3 From 3144bfa5078e0df7507a4de72061501e6a0e56be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Nov 2022 21:21:47 -0800 Subject: bpf: Fix a compilation failure with clang lto build When building the kernel with clang lto (CONFIG_LTO_CLANG_FULL=y), the following compilation error will appear: $ make LLVM=1 LLVM_IAS=1 -j ... ld.lld: error: ld-temp.o :26889:1: symbol 'cgroup_storage_map_btf_ids' is already defined cgroup_storage_map_btf_ids:; ^ make[1]: *** [/.../bpf-next/scripts/Makefile.vmlinux_o:61: vmlinux.o] Error 1 In local_storage.c, we have BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) Commit c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") added the above identical BTF_ID_LIST_SINGLE definition in bpf_cgrp_storage.c. With duplicated definitions, llvm linker complains with lto build. Also, extracting btf_id of 'struct bpf_local_storage_map' is defined four times for sk, inode, task and cgrp local storages. Let us define a single global one with a different name than cgroup_storage_map_btf_ids, which also fixed the lto compilation error. 
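The collision is easier to see from a rough sketch of what the macro emits (simplified, not the exact macro body from include/linux/btf_ids.h):

    /* BTF_ID_LIST_SINGLE(name, struct, type) roughly expands to an
     * assembler label plus a u32 array:
     *
     *     asm(".pushsection .BTF_ids ... name: ...");
     *     extern u32 name[];
     *
     * Under full LTO all translation units are merged into one
     * assembly context, so defining the same label in both
     * local_storage.c and bpf_cgrp_storage.c redefines 'name', even
     * though each definition was file-local in a non-LTO build.
     */

With this patch the list is emitted exactly once, with global scope:

    BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
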
Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221130052147.1591625-1-yhs@fb.com --- include/linux/btf_ids.h | 1 + kernel/bpf/bpf_cgrp_storage.c | 3 +-- kernel/bpf/bpf_inode_storage.c | 4 +--- kernel/bpf/bpf_task_storage.c | 4 ++-- net/core/bpf_sk_storage.c | 3 +-- 5 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93397711a68c..3a4f7cd882ca 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; +extern u32 bpf_local_storage_map_btf_id[]; #endif diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 309403800f82..6cdf6d9ed91d 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -211,7 +211,6 @@ BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgro return ret; } -BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -222,7 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = { .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &cgroup_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6a1d4d22816a..05f4c66c9089 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -213,8 +213,6 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &inode_cache, NULL); } -BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, - bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -225,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &inode_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 8e832db8151a..1e486055a523 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -324,7 +324,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } -BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -335,7 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &task_storage_map_btf_ids[0], + .map_btf_id = 
&bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 9d2288c0736e..bb378c33f542 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -310,7 +310,6 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -321,7 +320,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &sk_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, -- cgit v1.2.3 From bc66fa87d4fda9053a8145e5718fc278c2b88253 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Wed, 30 Nov 2022 10:12:16 +0800 Subject: net: phy: Add link between phy dev and mac dev If the external phy used by current mac interface is managed by another mac interface, it means that this network port cannot work independently, especially when the system suspends and resumes, the following trace may appear, so we should create a device link between phy dev and mac dev. WARNING: CPU: 0 PID: 24 at drivers/net/phy/phy.c:983 phy_error+0x20/0x68 Modules linked in: CPU: 0 PID: 24 Comm: kworker/0:2 Not tainted 6.1.0-rc3-00011-g5aaef24b5c6d-dirty #34 Hardware name: Freescale i.MX6 SoloX (Device Tree) Workqueue: events_power_efficient phy_state_machine unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __warn+0xb4/0x24c __warn from warn_slowpath_fmt+0x5c/0xd8 warn_slowpath_fmt from phy_error+0x20/0x68 phy_error from phy_state_machine+0x22c/0x23c phy_state_machine from process_one_work+0x288/0x744 process_one_work from worker_thread+0x3c/0x500 worker_thread from kthread+0xf0/0x114 kthread from ret_from_fork+0x14/0x28 Exception stack(0xf0951fb0 to 0xf0951ff8) Signed-off-by: Xiaolei Wang Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221130021216.1052230-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 12 ++++++++++++ include/linux/phy.h | 4 ++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8cff61dbc4b5..716870a4499c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1512,6 +1512,15 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, phy_resume(phydev); phy_led_triggers_register(phydev); + /** + * If the external phy used by current mac interface is managed by + * another mac interface, so we should create a device link between + * phy dev and mac dev. 
+ */ + if (phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) + phydev->devlink = device_link_add(dev->dev.parent, &phydev->mdio.dev, + DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); + return err; error: @@ -1750,6 +1759,9 @@ void phy_detach(struct phy_device *phydev) struct module *ndev_owner = NULL; struct mii_bus *bus; + if (phydev->devlink) + device_link_del(phydev->devlink); + if (phydev->sysfs_links) { if (dev) sysfs_remove_link(&dev->dev.kobj, "phydev"); diff --git a/include/linux/phy.h b/include/linux/phy.h index 9a3752c0c444..71eeb4e3b1fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -529,6 +529,8 @@ struct macsec_ops; * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance + * @devlink: Create a link between phy dev and mac dev, if the external phy + * used by current mac interface is managed by another mac interface. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -618,6 +620,8 @@ struct phy_device { /* And management functions */ struct phy_driver *drv; + struct device_link *devlink; + u32 phy_id; struct phy_c45_device_ids c45_ids; -- cgit v1.2.3 From 278ab9793116a8531ffaa52b4e61644971131e35 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 Nov 2022 13:26:45 -0800 Subject: wifi: ieee80211: Do not open-code qos address offsets When building with -Wstringop-overflow, GCC's KASAN implementation does not correctly perform bounds checking within some complex structures when faced with literal offsets, and can get very confused. For example, this warning is seen due to literal offsets into sturct ieee80211_hdr that may or may not be large enough: drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function 'iwl_mvm_rx_mpdu_mq': drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:2022:29: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 2022 | *qc &= ~IEEE80211_QOS_CTL_A_MSDU_PRESENT; In file included from drivers/net/wireless/intel/iwlwifi/mvm/fw-api.h:32, from drivers/net/wireless/intel/iwlwifi/mvm/sta.h:15, from drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:27, from drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:10: drivers/net/wireless/intel/iwlwifi/mvm/../fw/api/rx.h:559:16: note: at offset [78, 166] into destination object 'mpdu_len' of size 2 559 | __le16 mpdu_len; | ^~~~~~~~ Refactor ieee80211_get_qos_ctl() to avoid using literal offsets, requiring the creation of the actual structure that is described in the comments. Explicitly choose the desired offset, making the code more human-readable too. This is one of the last remaining warning to fix before enabling -Wstringop-overflow globally. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97490 Link: https://github.com/KSPP/linux/issues/181 Cc: Johannes Berg Cc: Kalle Valo Cc: Gregory Greenman Cc: "Gustavo A. R. 
Silva" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221130212641.never.627-kees@kernel.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6252f02f38b7..80d6308dea06 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -338,6 +338,17 @@ struct ieee80211_qos_hdr { __le16 qos_ctrl; } __packed __aligned(2); +struct ieee80211_qos_hdr_4addr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + __le16 seq_ctrl; + u8 addr4[ETH_ALEN]; + __le16 qos_ctrl; +} __packed __aligned(2); + struct ieee80211_trigger { __le16 frame_control; __le16 duration; @@ -4060,16 +4071,21 @@ struct ieee80211_he_6ghz_capa { * @hdr: the frame * * The qos ctrl bytes come after the frame_control, duration, seq_num - * and 3 or 4 addresses of length ETH_ALEN. - * 3 addr: 2 + 2 + 2 + 3*6 = 24 - * 4 addr: 2 + 2 + 2 + 4*6 = 30 + * and 3 or 4 addresses of length ETH_ALEN. Checks frame_control to choose + * between struct ieee80211_qos_hdr_4addr and struct ieee80211_qos_hdr. */ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) { - if (ieee80211_has_a4(hdr->frame_control)) - return (u8 *)hdr + 30; + union { + struct ieee80211_qos_hdr addr3; + struct ieee80211_qos_hdr_4addr addr4; + } *qos; + + qos = (void *)hdr; + if (ieee80211_has_a4(qos->addr3.frame_control)) + return (u8 *)&qos->addr4.qos_ctrl; else - return (u8 *)hdr + 24; + return (u8 *)&qos->addr3.qos_ctrl; } /** -- cgit v1.2.3 From eb8c507296f6038d46010396d91b42a05c3b64d9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:55 +0000 Subject: jump_label: Prevent key->enabled int overflow 1. With CONFIG_JUMP_LABEL=n static_key_slow_inc() doesn't have any protection against key->enabled refcounter overflow. 2. With CONFIG_JUMP_LABEL=y static_key_slow_inc_cpuslocked() still may turn the refcounter negative as (v + 1) may overflow. key->enabled is indeed a ref-counter as it's documented in multiple places: top comment in jump_label.h, Documentation/staging/static-keys.rst, etc. As -1 is reserved for static key that's in process of being enabled, functions would break with negative key->enabled refcount: - for CONFIG_JUMP_LABEL=n negative return of static_key_count() breaks static_key_false(), static_key_true() - the ref counter may become 0 from negative side by too many static_key_slow_inc() calls and lead to use-after-free issues. These flaws result in that some users have to introduce an additional mutex and prevent the reference counter from overflowing themselves, see bpf_enable_runtime_stats() checking the counter against INT_MAX / 2. Prevent the reference counter overflow by checking if (v + 1) > 0. Change functions API to return whether the increment was successful. 
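A hedged sketch of what a caller looks like after this change (the key name and error code are illustrative only):

    static struct static_key my_key = STATIC_KEY_INIT_FALSE;

    static int my_feature_enable(void)
    {
            /* The increment can now fail instead of silently
             * overflowing, so the result must be checked.
             */
            if (!static_key_slow_inc(&my_key))
                    return -EOVERFLOW;
            return 0;
    }
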
Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jakub Kicinski --- include/linux/jump_label.h | 21 +++++++++++++---- kernel/jump_label.c | 56 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 61 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 570831ca9951..4e968ebadce6 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -224,9 +224,10 @@ extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); -extern void static_key_slow_inc(struct static_key *key); +extern bool static_key_slow_inc(struct static_key *key); +extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -extern void static_key_slow_inc_cpuslocked(struct static_key *key); +extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -278,11 +279,23 @@ static __always_inline bool static_key_true(struct static_key *key) return false; } -static inline void static_key_slow_inc(struct static_key *key) +static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { + int v; + STATIC_KEY_CHECK_USE(key); - atomic_inc(&key->enabled); + /* + * Prevent key->enabled getting negative to follow the same semantics + * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. + */ + v = atomic_read(&key->enabled); + do { + if (v < 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + return true; } +#define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4d6c6f5f60db..d9c822bbffb8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -113,9 +113,40 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_slow_inc_cpuslocked(struct static_key *key) +/* + * static_key_fast_inc_not_disabled - adds a user for a static key + * @key: static key that must be already enabled + * + * The caller must make sure that the static key can't get disabled while + * in this function. It doesn't patch jump labels, only adds a user to + * an already enabled static key. + * + * Returns true if the increment was done. Unlike refcount_t the ref counter + * is not saturated, but will fail to increment on overflow. + */ +bool static_key_fast_inc_not_disabled(struct static_key *key) { + int v; + STATIC_KEY_CHECK_USE(key); + /* + * Negative key->enabled has a special meaning: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. 
+ */ + v = atomic_read(&key->enabled); + do { + if (v <= 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + + return true; +} +EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); + +bool static_key_slow_inc_cpuslocked(struct static_key *key) +{ lockdep_assert_cpus_held(); /* @@ -124,15 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. - * - * So give a special meaning to negative key->enabled: it sends - * static_key_slow_inc() down the slow path, and it is non-zero - * so it counts as "enabled" in jump_label_update(). Note that - * atomic_inc_unless_negative() checks >= 0, so roll our own. */ - for (int v = atomic_read(&key->enabled); v > 0; ) - if (likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))) - return; + if (static_key_fast_inc_not_disabled(key)) + return true; jump_label_lock(); if (atomic_read(&key->enabled) == 0) { @@ -144,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) */ atomic_set_release(&key->enabled, 1); } else { - atomic_inc(&key->enabled); + if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) { + jump_label_unlock(); + return false; + } } jump_label_unlock(); + return true; } -void static_key_slow_inc(struct static_key *key) +bool static_key_slow_inc(struct static_key *key) { + bool ret; + cpus_read_lock(); - static_key_slow_inc_cpuslocked(key); + ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); -- cgit v1.2.3 From d93607082e982223cf92750f2d9039ff365b9d24 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 30 Nov 2022 23:28:26 +0100 Subject: net: add netdev_sw_irq_coalesce_default_on() Add a helper for drivers wanting to set SW IRQ coalescing by default. The related sysfs attributes can be used to override the default values. Follow Jakub's suggestion and put this functionality into net core so that drivers wanting to use software interrupt coalescing per default don't have to open-code it. Note that this function needs to be called before the netdevice is registered. Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..f78db610ada5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -78,6 +78,7 @@ struct xdp_buff; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); +void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ diff --git a/net/core/dev.c b/net/core/dev.c index 7627c475d991..b76fb37b381e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10517,6 +10517,22 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +/** + * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default + * @dev: netdev to enable the IRQ coalescing on + * + * Sets a conservative default for SW IRQ coalescing. Users can use + * sysfs attributes to override the default values. 
+ */ +void netdev_sw_irq_coalesce_default_on(struct net_device *dev) +{ + WARN_ON(dev->reg_state == NETREG_REGISTERED); + + dev->gro_flush_timeout = 20000; + dev->napi_defer_hard_irqs = 1; +} +EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); + void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; -- cgit v1.2.3 From fca1aa75518c03b04c3c249e9a9134faf9ca18c5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:02 -0800 Subject: bpf: Handle MEM_RCU type properly Commit 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") introduced MEM_RCU and bpf_rcu_read_lock/unlock() support. In that commit, a rcu pointer is tagged with both MEM_RCU and PTR_TRUSTED so that it can be passed into kfuncs or helpers as an argument. Martin raised a good question in [1] such that the rcu pointer, although being able to accessing the object, might have reference count of 0. This might cause a problem if the rcu pointer is passed to a kfunc which expects trusted arguments where ref count should be greater than 0. This patch makes the following changes related to MEM_RCU pointer: - MEM_RCU pointer might be NULL (PTR_MAYBE_NULL). - Introduce KF_RCU so MEM_RCU ptr can be acquired with a KF_RCU tagged kfunc which assumes ref count of rcu ptr could be zero. - For mem access 'b = ptr->a', say 'ptr' is a MEM_RCU ptr, and 'a' is tagged with __rcu as well. Let us mark 'b' as MEM_RCU | PTR_MAYBE_NULL. [1] https://lore.kernel.org/bpf/ac70f574-4023-664e-b711-e0d3b18117fd@linux.dev/ Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184602.477272-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/linux/btf.h | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++------------- 4 files changed, 48 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b5090e89cb3f..c07b351a5bc7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -682,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/include/linux/btf.h b/include/linux/btf.h index 9ed00077db6e..cbd6e4096f8c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,6 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ /* * Return the name of the passed struct, if exists, or halt the build if for diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a5a511430f2a..cca642358e80 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1837,6 +1837,19 @@ struct task_struct *bpf_task_acquire(struct task_struct *p) return p; } +/** + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task + * acquired by this kfunc which is not stored in a map as a kptr, must be + * released by calling bpf_task_release(). + * @p: The task on which a reference is being acquired. 
+ */ +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +{ + if (!refcount_inc_not_zero(&p->rcu_users)) + return NULL; + return p; +} + /** * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task * kptr acquired by this kfunc which is not subsequently stored in a map, must @@ -2013,6 +2026,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) #ifdef CONFIG_CGROUPS diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0db9c10567b..66b82f52b5bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4275,7 +4275,7 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* If a register is not referenced, it is trusted if it has the - * MEM_ALLOC, MEM_RCU or PTR_TRUSTED type modifiers, and no others. Some of the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the * other type modifiers may be safe, but we elect to take an opt-in * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are * not. @@ -4287,6 +4287,11 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) !bpf_type_has_unsafe_modifiers(reg->type); } +static bool is_rcu_reg(const struct bpf_reg_state *reg) +{ + return reg->type & MEM_RCU; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -4785,14 +4790,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (flag & MEM_RCU) { /* Mark value register as MEM_RCU only if it is protected by - * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU + * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU * itself can already indicate trustedness inside the rcu - * read lock region. Also mark it as PTR_TRUSTED. + * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since + * it could be null in some cases. */ - if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg)) + if (!env->cur_state->active_rcu_lock || + !(is_trusted_reg(reg) || is_rcu_reg(reg))) flag &= ~MEM_RCU; else - flag |= PTR_TRUSTED; + flag |= PTR_MAYBE_NULL; } else if (reg->type & MEM_RCU) { /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. @@ -5957,7 +5964,7 @@ static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID, PTR_TO_BTF_ID | PTR_TRUSTED, - PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED, + PTR_TO_BTF_ID | MEM_RCU, }, }; static const struct bpf_reg_types percpu_btf_ptr_types = { @@ -6136,7 +6143,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: - case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. 
In the other cases, fixed offset @@ -8038,6 +8045,11 @@ static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_DESTRUCTIVE; } +static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU; +} + static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) { return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); @@ -8722,13 +8734,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ switch (kf_arg_type) { case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_kfunc_trusted_args(meta)) + if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; if (!is_trusted_reg(reg)) { - verbose(env, "R%d must be referenced or trusted\n", regno); - return -EINVAL; + if (!is_kfunc_rcu(meta)) { + verbose(env, "R%d must be referenced or trusted\n", regno); + return -EINVAL; + } + if (!is_rcu_reg(reg)) { + verbose(env, "R%d must be a rcu pointer\n", regno); + return -EINVAL; + } } + fallthrough; case KF_ARG_PTR_TO_CTX: /* Trusted arguments have the same offset checks as release arguments */ @@ -8839,7 +8858,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || - bpf_type_has_unsafe_modifiers(reg->type)) && + (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", @@ -8954,7 +8973,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } else if (rcu_unlock) { bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg->type & MEM_RCU) { - reg->type &= ~(MEM_RCU | PTR_TRUSTED); + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); @@ -11294,7 +11313,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, bool is_null) { if (type_may_be_null(reg->type) && reg->id == id && - !WARN_ON_ONCE(!reg->id)) { + (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { /* Old offset (both fixed and variable parts) should have been * known-zero, because we don't allow pointer arithmetic on * pointers that might be NULL. If we see this happening, don't -- cgit v1.2.3 From c0c852dd1876dc1db4600ce951a92aadd3073b1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 12:49:54 -0800 Subject: bpf: Do not mark certain LSM hook arguments as trusted Martin mentioned that the verifier cannot assume arguments from LSM hook sk_alloc_security being trusted since after the hook is called, the sk ref_count is set to 1. This will overwrite the ref_count changed by the bpf program and may cause ref_count underflow later on. I then further checked some other hooks. For example, for bpf_lsm_file_alloc() hook in fs/file_table.c, f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } atomic_long_set(&f->f_count, 1); The input parameter 'f' to security_file_alloc() cannot be trusted as well. Specifically, I investiaged bpf_map/bpf_prog/file/sk/task alloc/free lsm hooks. Except bpf_map_alloc and task_alloc, arguments for all other hooks should not be considered as trusted. This may not be a complete list, but it covers common usage for sk and task. 
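The socket path shows the same pattern; a simplified sketch of the ordering (condensed, not verbatim kernel code):

    /* The LSM hook runs before the refcount is initialized, so any
     * reference a BPF LSM program takes inside the hook is clobbered
     * afterwards:
     *
     *     err = security_sk_alloc(sk, family, priority);  // hook runs
     *     ...
     *     refcount_set(&sk->sk_refcnt, 1);                // overwrites it
     */
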
Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203204954.2043348-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ kernel/bpf/bpf_lsm.c | 16 ++++++++++++++++ kernel/bpf/btf.c | 2 ++ tools/testing/selftests/bpf/prog_tests/task_kfunc.c | 1 + tools/testing/selftests/bpf/progs/task_kfunc_failure.c | 11 +++++++++++ 5 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 4bcf76a9bb06..1de7ece5d36d 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -28,6 +28,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); +bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) @@ -51,6 +52,11 @@ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) return false; } +static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index ae0267f150b5..9ea42a45da47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -345,11 +345,27 @@ BTF_ID(func, bpf_lsm_task_to_inode) BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) +BTF_SET_START(untrusted_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) +BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_file_alloc_security) +BTF_ID(func, bpf_lsm_file_free_security) +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_ID(func, bpf_lsm_task_free) +BTF_SET_END(untrusted_lsm_hooks) + bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); } +bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d11cbf8cece7..c80bd8709e69 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -5829,6 +5830,7 @@ static bool prog_args_trusted(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; case BPF_PROG_TYPE_LSM: + return bpf_lsm_is_trusted(prog); case BPF_PROG_TYPE_STRUCT_OPS: return true; default: diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index ffd8ef4303c8..18848c31e36f 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -103,6 +103,7 @@ static struct { {"task_kfunc_release_null", "arg#0 is ptr_or_null_ expected ptr_ or socket"}, {"task_kfunc_release_unacquired", "release kernel function bpf_task_release expects"}, {"task_kfunc_from_pid_no_null_check", "arg#0 is ptr_or_null_ expected ptr_ or socket"}, + {"task_kfunc_from_lsm_task_free", "reg type unsupported for arg#0 function"}, }; static void verify_fail(const char *prog_name, const char *expected_err_msg) diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c 
b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index e310473190d5..87fa1db9d9b5 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -271,3 +271,14 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl return 0; } + +SEC("lsm/task_free") +int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task) +{ + struct task_struct *acquired; + + /* the argument of lsm task_free hook is untrusted. */ + acquired = bpf_task_acquire(task); + bpf_task_release(acquired); + return 0; +} -- cgit v1.2.3 From 919e43fad5163a8ceb39826ecdee897a9f799351 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:29 +0200 Subject: xfrm: add an interface to offload policy Extend netlink interface to add and delete XFRM policy from the device. This functionality is a first step to implement packet IPsec offload solution. Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 3 +++ include/net/xfrm.h | 45 +++++++++++++++++++++++++++++++ net/xfrm/xfrm_device.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_policy.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 18 +++++++++++++ 5 files changed, 201 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..4096e3fe8e4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,9 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + int (*xdo_dev_policy_add) (struct xfrm_policy *x); + void (*xdo_dev_policy_delete) (struct xfrm_policy *x); + void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b39d24fa2ef0..6fea34cbdf48 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -129,6 +129,7 @@ struct xfrm_state_walk { enum { XFRM_DEV_OFFLOAD_IN = 1, XFRM_DEV_OFFLOAD_OUT, + XFRM_DEV_OFFLOAD_FWD, }; enum { @@ -541,6 +542,8 @@ struct xfrm_policy { struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct hlist_node bydst_inexact_list; struct rcu_head rcu; + + struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) @@ -1585,6 +1588,8 @@ struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1899,6 +1904,9 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) @@ -1947,6 +1955,28 @@ 
static inline void xfrm_dev_state_free(struct xfrm_state *x) netdev_put(dev, &xso->dev_tracker); } } + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) + dev->xfrmdev_ops->xdo_dev_policy_delete(x); +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops) { + if (dev->xfrmdev_ops->xdo_dev_policy_free) + dev->xfrmdev_ops->xdo_dev_policy_free(x); + xdo->dev = NULL; + netdev_put(dev, &xdo->dev_tracker); + } +} #else static inline void xfrm_dev_resume(struct sk_buff *skb) { @@ -1974,6 +2004,21 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) { } +static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ +} + static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3184b2c394b6..04ae510dcc66 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -325,6 +325,69 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + struct xfrm_dev_offload *xdo = &xp->xdo; + struct net_device *dev; + int err; + + if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { + /* We support only packet offload mode and it means + * that user must set XFRM_OFFLOAD_PACKET bit. 
+ */ + NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); + return -EINVAL; + } + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) + return -EINVAL; + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Policy offload is not supported"); + return -EINVAL; + } + + xdo->dev = dev; + netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); + xdo->real_dev = dev; + xdo->type = XFRM_DEV_OFFLOAD_PACKET; + switch (dir) { + case XFRM_POLICY_IN: + xdo->dir = XFRM_DEV_OFFLOAD_IN; + break; + case XFRM_POLICY_OUT: + xdo->dir = XFRM_DEV_OFFLOAD_OUT; + break; + case XFRM_POLICY_FWD: + xdo->dir = XFRM_DEV_OFFLOAD_FWD; + break; + default: + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Unrecognized oflload direction"); + return -EINVAL; + } + + err = dev->xfrmdev_ops->xdo_dev_policy_add(xp); + if (err) { + xdo->dev = NULL; + xdo->real_dev = NULL; + xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xdo->dir = 0; + netdev_put(dev, &xdo->dev_tracker); + NL_SET_ERR_MSG(extack, "Device failed to offload this policy"); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); + bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; @@ -427,8 +490,10 @@ static int xfrm_api_check(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); + xfrm_dev_policy_flush(dev_net(dev), dev, true); + } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9b9e2765363d..8b8760907563 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -425,6 +425,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); + xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); @@ -1769,12 +1770,41 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) } return err; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + struct xfrm_policy *pol; + int err = 0; + + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; + } + } + return err; +} #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + return 0; +} #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) @@ -1814,6 +1844,44 @@ out: } EXPORT_SYMBOL(xfrm_policy_flush); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid) +{ + int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + + err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + __xfrm_policy_unlink(pol, dir); + 
spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; + } + if (cnt) + __xfrm_policy_inexact_flush(net); + else + err = -ESRCH; +out: + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_policy_flush); + int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) @@ -2245,6 +2313,7 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { + xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c3b8c1532718..cf5172d4ce68 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1892,6 +1892,15 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_policy_add(net, xp, + nla_data(attrs[XFRMA_OFFLOAD_DEV]), + p->dir, extack); + if (err) + goto error; + } + return xp; error: *errp = err; @@ -1931,6 +1940,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) { + xfrm_dev_policy_delete(xp); security_xfrm_policy_free(xp->security); kfree(xp); return err; @@ -2043,6 +2053,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3381,6 +3393,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3499,6 +3513,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3582,6 +3598,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) goto out_free_skb; -- cgit v1.2.3 From f3da86dc2c8c9004445cfbb15ac086773622d853 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:33 +0200 Subject: xfrm: add support to HW update soft and hard limits Both in RX and TX, the traffic that performs IPsec packet offload transformation is accounted by HW. It is needed to properly handle hard limits that require to drop the packet. It means that XFRM core needs to update internal counters with the one that accounted by the HW, so new callbacks are introduced in this patch. In case of soft or hard limit is occurred, the driver should call to xfrm_state_check_expire() that will perform key rekeying exactly as done by XFRM core. 
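A hedged sketch of how a driver might implement the new callback (my_hw_read_sa_counters() is a placeholder for the device-specific counter read, not an existing API):

    static void my_xdo_dev_state_update_curlft(struct xfrm_state *x)
    {
            u64 bytes, packets;

            /* Fold the HW per-SA counters into the state's current
             * lifetime so soft/hard limits fire at the right time.
             */
            my_hw_read_sa_counters(x, &bytes, &packets);
            x->curlft.bytes = bytes;
            x->curlft.packets = packets;
    }

    static const struct xfrmdev_ops my_xfrmdev_ops = {
            .xdo_dev_state_update_curlft = my_xdo_dev_state_update_curlft,
            /* ... remaining xdo callbacks ... */
    };
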
Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 1 + include/net/xfrm.h | 17 +++++++++++++++++ net/xfrm/xfrm_state.c | 4 ++++ 3 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4096e3fe8e4a..29ae964e3b89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b6ee14991a32..5413cdd5ad62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1571,6 +1571,23 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +#ifdef CONFIG_XFRM_OFFLOAD +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xdo = &x->xso; + struct net_device *dev = xdo->dev; + + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + return; + + if (dev && dev->xfrmdev_ops && + dev->xfrmdev_ops->xdo_dev_state_update_curlft) + dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); + +} +#else +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} +#endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2a190e85da80..cc1d0ea42672 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -570,6 +570,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) int err = 0; spin_lock(&x->lock); + xfrm_dev_state_update_curlft(x); + if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) @@ -1936,6 +1938,8 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { + xfrm_dev_state_update_curlft(x); + if (!x->curlft.use_time) x->curlft.use_time = ktime_get_real_seconds(); -- cgit v1.2.3 From a46e9010124256f5bf5fc2c241a45cf1944b768e Mon Sep 17 00:00:00 2001 From: Revanth Kumar Uppala Date: Thu, 1 Dec 2022 15:58:43 +0000 Subject: net: stmmac: Power up SERDES after the PHY link The Tegra MGBE ethernet controller requires that the SERDES link is powered-up after the PHY link is up, otherwise the link fails to become ready following a resume from suspend. Add a variable to indicate that the SERDES link must be powered-up after the PHY link. Signed-off-by: Revanth Kumar Uppala Signed-off-by: Jon Hunter Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 +++++-- include/linux/stmmac.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 18c7ca29da2c..f36590d0c830 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -988,6 +988,9 @@ static void stmmac_mac_link_up(struct phylink_config *config, struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); u32 old_ctrl, ctrl; + if (priv->plat->serdes_up_after_phy_linkup && priv->plat->serdes_powerup) + priv->plat->serdes_powerup(priv->dev, priv->plat->bsp_priv); + old_ctrl = readl(priv->ioaddr + MAC_CTRL_REG); ctrl = old_ctrl & ~priv->hw->link.speed_mask; @@ -3809,7 +3812,7 @@ static int __stmmac_open(struct net_device *dev, stmmac_reset_queues_param(priv); - if (priv->plat->serdes_powerup) { + if (!priv->plat->serdes_up_after_phy_linkup && priv->plat->serdes_powerup) { ret = priv->plat->serdes_powerup(dev, priv->plat->bsp_priv); if (ret < 0) { netdev_err(priv->dev, "%s: Serdes powerup failed\n", @@ -7518,7 +7521,7 @@ int stmmac_resume(struct device *dev) stmmac_mdio_reset(priv->mii); } - if (priv->plat->serdes_powerup) { + if (!priv->plat->serdes_up_after_phy_linkup && priv->plat->serdes_powerup) { ret = priv->plat->serdes_powerup(ndev, priv->plat->bsp_priv); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fb2e88614f5d..83ca2e8eb6b5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -271,5 +271,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; bool use_phy_wol; bool sph_disable; + bool serdes_up_after_phy_linkup; }; #endif -- cgit v1.2.3 From 3afee4ed336ed7a6cff78d23339879ffb654e02e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 22:10:23 +0200 Subject: net/mlx5: Add HW definitions for IPsec packet offload Add all needed bits to support IPsec packet offload mode. 
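As a hedged illustration of how these bits would be consumed (the wrapper function below is hypothetical, and note that "trasport" is the field's actual spelling in mlx5_ifc.h), a driver could gate packet offload on the new reformat capabilities via the generated MLX5_CAP_* accessors:

static bool mydrv_ipsec_packet_offload_supported(struct mlx5_core_dev *mdev)
{
	/* ESP transport-mode encap is added on TX and removed on RX */
	if (!MLX5_CAP_FLOWTABLE_NIC_TX(mdev, reformat_add_esp_trasport) ||
	    !MLX5_CAP_FLOWTABLE_NIC_RX(mdev, reformat_del_esp_trasport))
		return false;

	return true;
}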
Reviewed-by: Raed Salem Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h | 1 + include/linux/mlx5/mlx5_ifc.h | 53 +++++++++++++++++++++-- 2 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h index 4312614bf3bc..c8fc3c838642 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h @@ -71,6 +71,7 @@ enum { }; enum { + MLX5_ACCESS_ASO_OPC_MOD_IPSEC = 0x0, MLX5_ACCESS_ASO_OPC_MOD_FLOW_METER = 0x2, MLX5_ACCESS_ASO_OPC_MOD_MACSEC = 0x5, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..300b56ea5ff4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -445,7 +445,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; - u8 reserved_at_40[0x6]; + u8 reformat_add_esp_trasport[0x1]; + u8 reserved_at_41[0x2]; + u8 reformat_del_esp_trasport[0x1]; + u8 reserved_at_44[0x2]; u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; @@ -638,8 +641,10 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 reserved_at_1a0[0x8]; u8 macsec_syndrome[0x8]; + u8 ipsec_syndrome[0x8]; + u8 reserved_at_1b8[0x8]; - u8 reserved_at_1b0[0x50]; + u8 reserved_at_1c0[0x40]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -6384,6 +6389,9 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2, MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5, + MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -11563,6 +11571,41 @@ enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, }; +enum { + MLX5_IPSEC_ASO_REG_C_0_1 = 0x0, + MLX5_IPSEC_ASO_REG_C_2_3 = 0x1, + MLX5_IPSEC_ASO_REG_C_4_5 = 0x2, + MLX5_IPSEC_ASO_REG_C_6_7 = 0x3, +}; + +enum { + MLX5_IPSEC_ASO_MODE = 0x0, + MLX5_IPSEC_ASO_REPLAY_PROTECTION = 0x1, + MLX5_IPSEC_ASO_INC_SN = 0x2, +}; + +struct mlx5_ifc_ipsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_201[0x1]; + u8 mode[0x2]; + u8 window_sz[0x2]; + u8 soft_lft_arm[0x1]; + u8 hard_lft_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 esn_event_arm[0x1]; + u8 reserved_at_20a[0x16]; + + u8 remove_flow_pkt_cnt[0x20]; + + u8 remove_flow_soft_lft[0x20]; + + u8 reserved_at_260[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[0x100]; +}; + struct mlx5_ifc_ipsec_obj_bits { u8 modify_field_select[0x40]; u8 full_offload[0x1]; @@ -11584,7 +11627,11 @@ struct mlx5_ifc_ipsec_obj_bits { u8 implicit_iv[0x40]; - u8 reserved_at_100[0x700]; + u8 reserved_at_100[0x8]; + u8 ipsec_aso_access_pd[0x18]; + u8 reserved_at_120[0xe0]; + + struct mlx5_ifc_ipsec_aso_bits ipsec_aso; }; struct mlx5_ifc_create_ipsec_obj_in_bits { -- cgit v1.2.3 From bffdeaa8a5af7200b0e74c9d5a41167f86626a36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:43 -0800 Subject: bpf: decouple prune and jump points BPF verifier marks some instructions as prune points. Currently these prune points serve two purposes. It's a point where verifier tries to find previously verified state and check current state's equivalence to short circuit verification for current code path. 
But also currently it's a point where jump history, used for precision backtracking, is updated. This is done so that non-linear flow of execution could be properly backtracked. Such coupling is coincidental and unnecessary. Some prune points are not part of some non-linear jump path, so don't need update of jump history. On the other hand, not all instructions which have to be recorded in jump history necessarily are good prune points. This patch splits prune and jump points into independent flags. Currently all prune points are marked as jump points to minimize amount of changes in this patch, but next patch will perform some optimization of prune vs jmp point placement. No functional changes are intended. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c07b351a5bc7..70d06a99f0b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -452,6 +452,7 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool prune_point; + bool jmp_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67a13110bc22..cf580e4c00f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2530,6 +2530,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) @@ -2538,6 +2548,9 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_idx_pair *p; size_t alloc_size; + if (!is_jmp_point(env, env->insn_idx)) + return 0; + cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); @@ -12140,11 +12153,16 @@ static struct bpf_verifier_state_list **explored_state( return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } -static void init_explored_state(struct bpf_verifier_env *env, int idx) +static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; } +static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].prune_point; +} + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -12173,9 +12191,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return -EINVAL; } - if (e == BRANCH) + if (e == BRANCH) { /* mark branch target for state pruning */ - init_explored_state(env, w); + mark_prune_point(env, w); + mark_jmp_point(env, w); + } if (insn_state[w] == 0) { /* tree-edge */ @@ -12213,10 +12233,13 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + if (t + 1 < insn_cnt) { + 
mark_prune_point(env, t + 1); + mark_jmp_point(env, t + 1); + } if (visit_callee) { - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. __check_func_call() will do the actual @@ -12250,13 +12273,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) + if (insns[t].imm == BPF_FUNC_timer_set_callback) { /* Mark this call insn to trigger is_state_visited() check * before call itself is processed by __check_func_call(). * Otherwise new async state will be pushed for further * exploration. */ - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -12274,18 +12299,22 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) * but it's marked, since backtracking needs * to record jmp history in is_state_visited(). */ - init_explored_state(env, t + insns[t].off + 1); + mark_prune_point(env, t + insns[t].off + 1); + mark_jmp_point(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + if (t + 1 < insn_cnt) { + mark_prune_point(env, t + 1); + mark_jmp_point(env, t + 1); + } return ret; default: /* conditional jump with two edges */ - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; @@ -13320,11 +13349,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) bool add_new_state = env->test_state_freq ? 
true : false; cur->last_insn_idx = env->prev_insn_idx; - if (!env->insn_aux_data[insn_idx].prune_point) + if (!is_prune_point(env, insn_idx)) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ - return 0; + return push_jmp_history(env, cur); /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 -- cgit v1.2.3 From ed883bec679b027b198d57a336715f8298fb88b4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 5 Dec 2022 12:34:42 +0100 Subject: net: ethernet: mtk_wed: add reset to rx_ring_setup callback This patch adds reset parameter to mtk_wed_rx_ring_setup signature in order to align rx_ring_setup callback to tx_ring_setup one introduced in 'commit 23dca7a90017 ("net: ethernet: mtk_wed: add reset to tx_ring_setup callback")' Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/29c6e7a5469e784406cf3e2920351d1207713d05.1670239984.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 20 +++++++++++++------- drivers/net/wireless/mediatek/mt76/dma.c | 2 +- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index d041615b2bac..06b6cc53fa02 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1210,7 +1210,8 @@ mtk_wed_wdma_rx_ring_setup(struct mtk_wed_device *dev, int idx, int size, } static int -mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size) +mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size, + bool reset) { u32 desc_size = sizeof(struct mtk_wdma_desc) * dev->hw->version; struct mtk_wed_ring *wdma; @@ -1219,8 +1220,8 @@ mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size) return -EINVAL; wdma = &dev->tx_wdma[idx]; - if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, desc_size, - true)) + if (!reset && mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE, + desc_size, true)) return -ENOMEM; wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_BASE, @@ -1230,6 +1231,9 @@ mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size) wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_CPU_IDX, 0); wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_DMA_IDX, 0); + if (reset) + mtk_wed_ring_reset(wdma, MTK_WED_WDMA_RING_SIZE, true); + if (!idx) { wed_w32(dev, MTK_WED_WDMA_RING_TX + MTK_WED_RING_OFS_BASE, wdma->desc_phys); @@ -1569,18 +1573,20 @@ mtk_wed_txfree_ring_setup(struct mtk_wed_device *dev, void __iomem *regs) } static int -mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs) +mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs, + bool reset) { struct mtk_wed_ring *ring = &dev->rx_ring[idx]; if (WARN_ON(idx >= ARRAY_SIZE(dev->rx_ring))) return -EINVAL; - if (mtk_wed_ring_alloc(dev, ring, MTK_WED_RX_RING_SIZE, - sizeof(*ring->desc), false)) + if (!reset && mtk_wed_ring_alloc(dev, ring, MTK_WED_RX_RING_SIZE, + sizeof(*ring->desc), false)) return -ENOMEM; - if (mtk_wed_wdma_tx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE)) + if (mtk_wed_wdma_tx_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE, + reset)) return -ENOMEM; ring->reg_base = MTK_WED_RING_RX_DATA(idx); diff --git 
a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index 3f8c0845fcca..f795548562f5 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -648,7 +648,7 @@ mt76_dma_wed_setup(struct mt76_dev *dev, struct mt76_queue *q) q->wed_regs = wed->txfree_ring.reg_base; break; case MT76_WED_Q_RX: - ret = mtk_wed_device_rx_ring_setup(wed, ring, q->regs); + ret = mtk_wed_device_rx_ring_setup(wed, ring, q->regs, false); if (!ret) q->wed_regs = wed->rx_ring[ring].reg_base; break; diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index beb190449704..a0746d4aec20 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -160,7 +160,7 @@ struct mtk_wed_ops { int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, @@ -228,8 +228,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ @@ -249,7 +249,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #define mtk_wed_device_stop(_dev) do {} while (0) -- cgit v1.2.3 From 5b481acab4ce017fda8166fa9428511da41109e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Dec 2022 15:59:32 +0100 Subject: bpf: do not rely on ALLOW_ERROR_INJECTION for fmod_ret The current way of expressing that a non-bpf kernel component is willing to accept that bpf programs can be attached to it and that they can change the return value is to abuse ALLOW_ERROR_INJECTION. This is debated in the link below, and the result is that it is not a reasonable thing to do. Reuse the kfunc declaration structure to also tag the kernel functions we want to be fmodret. This way we can control from any subsystem which functions are being modified by bpf without touching the verifier. 
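The registration pattern this patch introduces can be sketched as follows; my_func and the surrounding names are placeholders, not part of the patch:

#include <linux/btf.h>
#include <linux/btf_ids.h>

BTF_SET8_START(my_fmodret_ids)
BTF_ID_FLAGS(func, my_func)	/* my_func becomes attachable as fmod_ret */
BTF_SET8_END(my_fmodret_ids)

static const struct btf_kfunc_id_set my_fmodret_set = {
	.owner = THIS_MODULE,
	.set = &my_fmodret_ids,
};

static int __init my_subsys_init(void)
{
	/* the API must be called from an initcall or module init function */
	return register_btf_fmodret_id_set(&my_fmodret_set);
}
late_initcall(my_subsys_init);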
Link: https://lore.kernel.org/all/20221121104403.1545f9b5@gandalf.local.home/ Suggested-by: Alexei Starovoitov Signed-off-by: Benjamin Tissoires Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20221206145936.922196-2-benjamin.tissoires@redhat.com --- include/linux/btf.h | 2 ++ kernel/bpf/btf.c | 30 +++++++++++++++++++++++++----- kernel/bpf/verifier.c | 17 +++++++++++++++-- net/bpf/test_run.c | 14 +++++++++++--- 4 files changed, 53 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..6a0808e7845c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -412,8 +412,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, u32 kfunc_btf_id); +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..c1df506293e4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -204,6 +204,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_MAX, }; @@ -7448,11 +7449,14 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf, return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } -/* This function must be invoked only from initcalls/module init functions */ -int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, - const struct btf_kfunc_id_set *kset) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +{ + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); +} + +static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - enum btf_kfunc_hook hook; struct btf *btf; int ret; @@ -7471,13 +7475,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, if (IS_ERR(btf)) return PTR_ERR(btf); - hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __register_btf_kfunc_id_set(hook, kset); +} EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) +{ + return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); +} +EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); + s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 225666307bba..ec2e6ec7309e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15021,12 +15021,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - /* fentry/fexit/fmod_ret progs can be sleepable only if 
they are
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
 * attached to ALLOW_ERROR_INJECTION and are not in denylist.
 */
 if (!check_non_sleepable_error_inject(btf_id) &&
 within_error_injection_list(addr))
 ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
 break;
 case BPF_PROG_TYPE_LSM:
 /* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -15047,7 +15057,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 bpf_log(log, "can't modify return codes of BPF programs\n");
 return -EINVAL;
 }
- ret = check_attach_modify_return(addr, tname);
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
 if (ret) {
 bpf_log(log, "%s() is not modifiable\n", tname);
 return ret;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 13d578ce2a09..5079fb089b5f 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -489,7 +489,6 @@ int noinline bpf_fentry_test1(int a)
 return a + 1;
 }
 EXPORT_SYMBOL_GPL(bpf_fentry_test1);
-ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO);
 int noinline bpf_fentry_test2(int a, u64 b)
 {
@@ -733,7 +732,15 @@ noinline void bpf_kfunc_call_test_destructive(void)
 __diag_pop();
-ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
+BTF_SET8_START(bpf_test_modify_return_ids)
+BTF_ID_FLAGS(func, bpf_modify_return_test)
+BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE)
+BTF_SET8_END(bpf_test_modify_return_ids)
+
+static const struct btf_kfunc_id_set bpf_test_modify_return_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_test_modify_return_ids,
+};
 BTF_SET8_START(test_sk_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
@@ -1668,7 +1675,8 @@ static int __init bpf_prog_test_run_init(void)
 };
 int ret;
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
+ ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set);
 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set);
 return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc,
-- cgit v1.2.3 
From df268f6ca7da1088d7ecff4250d1478d3fa2406b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 20:51:12 +0200 Subject: net/mlx5: Introduce IFC bits for migratable
Introduce the IFC-related capabilities needed to allow setting a VF to be migratable, i.e. able to perform live migration.
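A minimal sketch of how the new bit would be read, assuming the standard MLX5_CAP_GEN_2() accessor for cmd_hca_cap_2 fields (the wrapper function itself is hypothetical):

static bool mydrv_vf_migration_supported(struct mlx5_core_dev *mdev)
{
	/* migratable is the new cmd_hca_cap_2 bit introduced above */
	return MLX5_CAP_GEN_2(mdev, migratable);
}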
Signed-off-by: Yishai Hadas Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux')
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5a4e914e2a6f..2093131483c7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -68,6 +68,7 @@ enum {
 MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2,
 MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3,
 MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4,
+ MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2 = 0x20,
 MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25,
 };
@@ -1875,7 +1876,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 };
 struct mlx5_ifc_cmd_hca_cap_2_bits {
- u8 reserved_at_0[0xa0];
+ u8 reserved_at_0[0x80];
+
+ u8 migratable[0x1];
+ u8 reserved_at_81[0x1f];
 u8 max_reformat_insert_size[0x8];
 u8 max_reformat_insert_offset[0x8];
-- cgit v1.2.3 
From 47d0c500d76c7b1d220c47cd875763fef04eba2e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:16 +0200 Subject: net/mlx5: Add generic getters for other functions caps
A downstream patch needs to get another function's GENERAL2 caps, while mlx5_vport_get_other_func_cap() gets only one type of caps (general). Rename it to reflect this and introduce a generic implementation of mlx5_vport_get_other_func_cap().
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 6 ++++-- include/linux/mlx5/vport.h | 2 ++ 5 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux')
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 9b6fbb19c22a..33dffcb8bdd7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3889,7 +3889,7 @@ static int mlx5_esw_query_vport_vhca_id(struct mlx5_eswitch *esw, u16 vport_num,
 if (!query_ctx)
 return -ENOMEM;
- err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx);
+ err = mlx5_vport_get_other_func_general_cap(esw->dev, vport_num, query_ctx);
 if (err)
 goto out_free;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a806e3de7b7c..09473983778f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -324,7 +324,8 @@ void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev);
 int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
 int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery);
-int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
+#define mlx5_vport_get_other_func_general_cap(dev, fid, out) \
+ mlx5_vport_get_other_func_cap(dev, fid, out, MLX5_CAP_GENERAL)
 void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work);
 static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 662f1d55e30e..6bde18bcd42f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ 
b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "mlx5_core.h" #include "mlx5_irq.h" #include "pci_irq.h" @@ -101,7 +102,7 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id, goto out; } - ret = mlx5_vport_get_other_func_cap(dev, function_id, query_cap); + ret = mlx5_vport_get_other_func_general_cap(dev, function_id, query_cap); if (ret) goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d5c317325030..7eca7582f243 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1160,14 +1160,16 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) } EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); -int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out) +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out, + u16 opmod) { - u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + opmod = (opmod << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, op_mod, opmod); MLX5_SET(query_hca_cap_in, in, function_id, function_id); MLX5_SET(query_hca_cap_in, in, other_function, true); return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); } +EXPORT_SYMBOL_GPL(mlx5_vport_get_other_func_cap); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index aad53cb72f17..7f31432f44c2 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -132,4 +132,6 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out, + u16 opmod); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From f1543c7abab25d93bc8e9fae79b4cb3153ed6669 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 31 May 2022 13:55:28 +0300 Subject: net/mlx5: mlx5_ifc updates for MATCH_DEFINER general object Update full structure of match definer and add an ID of the SELECT match definer type. 
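A sketch of programming a SELECT-format definer with the fields added here, using the standard MLX5_SET() accessors over the mlx5_ifc_match_definer_bits layout; which selector should carry which offset is device- and use-case-specific, so the dw0 choice below is only an assumption for illustration:

static void mydrv_init_select_definer(void *definer_obj)
{
	/* use the SELECT format id added by this patch */
	MLX5_SET(match_definer, definer_obj, format_id,
		 MLX5_IFC_DEFINER_FORMAT_ID_SELECT);

	/* example: let DW selector 0 feed the outer packet-length word,
	 * using the offset constant also defined in this patch
	 */
	MLX5_SET(match_definer, definer_obj, format_select_dw0,
		 MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN);
}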
Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2093131483c7..294cfe175c4b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6108,6 +6108,38 @@ struct mlx5_ifc_match_definer_format_32_bits { u8 inner_dmac_15_0[0x10]; }; +enum { + MLX5_IFC_DEFINER_FORMAT_ID_SELECT = 61, +}; + +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED 0x0 +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN 0x48 +#define MLX5_IFC_DEFINER_DW_SELECTORS_NUM 9 +#define MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM 8 + +struct mlx5_ifc_match_definer_match_mask_bits { + u8 reserved_at_1c0[5][0x20]; + u8 match_dw_8[0x20]; + u8 match_dw_7[0x20]; + u8 match_dw_6[0x20]; + u8 match_dw_5[0x20]; + u8 match_dw_4[0x20]; + u8 match_dw_3[0x20]; + u8 match_dw_2[0x20]; + u8 match_dw_1[0x20]; + u8 match_dw_0[0x20]; + + u8 match_byte_7[0x8]; + u8 match_byte_6[0x8]; + u8 match_byte_5[0x8]; + u8 match_byte_4[0x8]; + + u8 match_byte_3[0x8]; + u8 match_byte_2[0x8]; + u8 match_byte_1[0x8]; + u8 match_byte_0[0x8]; +}; + struct mlx5_ifc_match_definer_bits { u8 modify_field_select[0x40]; @@ -6116,9 +6148,41 @@ struct mlx5_ifc_match_definer_bits { u8 reserved_at_80[0x10]; u8 format_id[0x10]; - u8 reserved_at_a0[0x160]; + u8 reserved_at_a0[0x60]; - u8 match_mask[16][0x20]; + u8 format_select_dw3[0x8]; + u8 format_select_dw2[0x8]; + u8 format_select_dw1[0x8]; + u8 format_select_dw0[0x8]; + + u8 format_select_dw7[0x8]; + u8 format_select_dw6[0x8]; + u8 format_select_dw5[0x8]; + u8 format_select_dw4[0x8]; + + u8 reserved_at_100[0x18]; + u8 format_select_dw8[0x8]; + + u8 reserved_at_120[0x20]; + + u8 format_select_byte3[0x8]; + u8 format_select_byte2[0x8]; + u8 format_select_byte1[0x8]; + u8 format_select_byte0[0x8]; + + u8 format_select_byte7[0x8]; + u8 format_select_byte6[0x8]; + u8 format_select_byte5[0x8]; + u8 format_select_byte4[0x8]; + + u8 reserved_at_180[0x40]; + + union { + struct { + u8 match_mask[16][0x20]; + }; + struct mlx5_ifc_match_definer_match_mask_bits match_mask_format; + }; }; struct mlx5_ifc_general_obj_in_cmd_hdr_bits { -- cgit v1.2.3 From 38bf24c38d19dc4ba5ec684ac2afa2f8e256db1e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 15 Aug 2022 12:45:28 +0300 Subject: net/mlx5: fs, add match on ranges API Range is a new flow destination type which allows matching on a range of values instead of matching on a specific value. 
Range flow destination has the following fields: - hit_ft: flow table to forward the traffic in case of hit - miss_ft: flow table to forward the traffic in case of miss - field: which packet characteristic to match on - min: minimal value for the selected field - max: maximal value for the selected field Note: - In order to match, the value in the packet should meet the following criteria: min <= value < max - Currently, the only supported field type is L2 packet length Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 16 ++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 11 +++++++++-- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + include/linux/mlx5/fs.h | 12 ++++++++++++ 4 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index c5bb79a4fa57..2732128e7a6e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -228,6 +228,17 @@ const char *parse_fs_hdrs(struct trace_seq *p, return ret; } +static const char +*fs_dest_range_field_to_str(enum mlx5_flow_dest_range_field field) +{ + switch (field) { + case MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN: + return "packet len"; + default: + return "unknown dest range field"; + } +} + const char *parse_fs_dst(struct trace_seq *p, const struct mlx5_flow_destination *dst, u32 counter_id) @@ -259,6 +270,11 @@ const char *parse_fs_dst(struct trace_seq *p, case MLX5_FLOW_DESTINATION_TYPE_PORT: trace_seq_printf(p, "port\n"); break; + case MLX5_FLOW_DESTINATION_TYPE_RANGE: + trace_seq_printf(p, "field=%s min=%d max=%d\n", + fs_dest_range_field_to_str(dst->range.field), + dst->range.min, dst->range.max); + break; case MLX5_FLOW_DESTINATION_TYPE_NONE: trace_seq_printf(p, "none\n"); break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index d53190f22871..4dcd26b86662 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -448,7 +448,8 @@ static bool is_fwd_dest_type(enum mlx5_flow_destination_type type) type == MLX5_FLOW_DESTINATION_TYPE_UPLINK || type == MLX5_FLOW_DESTINATION_TYPE_VPORT || type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER || - type == MLX5_FLOW_DESTINATION_TYPE_TIR; + type == MLX5_FLOW_DESTINATION_TYPE_TIR || + type == MLX5_FLOW_DESTINATION_TYPE_RANGE; } static bool check_valid_spec(const struct mlx5_flow_spec *spec) @@ -1578,7 +1579,13 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM && d1->ft_num == d2->ft_num) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER && - d1->sampler_id == d2->sampler_id)) + d1->sampler_id == d2->sampler_id) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_RANGE && + d1->range.field == d2->range.field && + d1->range.hit_ft == d2->range.hit_ft && + d1->range.miss_ft == d2->range.miss_ft && + d1->range.min == d2->range.min && + d1->range.max == d2->range.max)) return true; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 3af50fd04d28..f137a0611b77 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -123,6 +123,7 @@ enum mlx5_flow_steering_mode {
 enum mlx5_flow_steering_capabilty {
 MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX = 1UL << 0,
 MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX = 1UL << 1,
+ MLX5_FLOW_STEERING_CAP_MATCH_RANGES = 1UL << 2,
 };
 struct mlx5_flow_steering {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index c7a91981cd5a..ba6958b49a8e 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -50,6 +50,7 @@ enum mlx5_flow_destination_type {
 MLX5_FLOW_DESTINATION_TYPE_PORT,
 MLX5_FLOW_DESTINATION_TYPE_COUNTER,
 MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM,
+ MLX5_FLOW_DESTINATION_TYPE_RANGE,
 };
 enum {
@@ -143,6 +144,10 @@ enum {
 MLX5_FLOW_DEST_VPORT_REFORMAT_ID = BIT(1),
 };
+enum mlx5_flow_dest_range_field {
+ MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN = 0,
+};
+
 struct mlx5_flow_destination {
 enum mlx5_flow_destination_type type;
 union {
@@ -156,6 +161,13 @@ struct mlx5_flow_destination {
 struct mlx5_pkt_reformat *pkt_reformat;
 u8 flags;
 } vport;
+ struct {
+ struct mlx5_flow_table *hit_ft;
+ struct mlx5_flow_table *miss_ft;
+ enum mlx5_flow_dest_range_field field;
+ u32 min;
+ u32 max;
+ } range;
 u32 sampler_id;
 };
 };
-- cgit v1.2.3 
From 6b75bd3d036745b9be30917909f03602099adbdb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:35 +0530 Subject: bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func
ARG_PTR_TO_DYNPTR is akin to ARG_PTR_TO_TIMER and ARG_PTR_TO_KPTR, where the underlying register type is subjected to more special checks to determine the type of object represented by the pointer and its state consistency. Move the dynptr checks into their own 'process_dynptr_func' function so that this is consistent and in line with existing code. This also makes it easier to reuse the code for kfunc handling. Then, reuse this consolidated function in the kfunc dynptr handling too. Note that for kfuncs, the arg_type constraint of DYNPTR_TYPE_LOCAL has been lifted.
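To make the enforced calling convention concrete, here is a hedged BPF-side sketch of what process_dynptr_func() accepts: a stack-resident dynptr that is first initialized (satisfying the MEM_UNINIT check) and only then consumed. The map and program names are placeholders.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);
} rb SEC(".maps");

SEC("tp/syscalls/sys_enter_write")
int dynptr_usage_ok(void *ctx)
{
	struct bpf_dynptr ptr;

	/* initializes the uninitialized stack dynptr (MEM_UNINIT arg) */
	bpf_ringbuf_reserve_dynptr(&rb, 16, 0, &ptr);
	/* consumes the now-initialized dynptr; the verifier checks that
	 * the stack slots were marked initialized by the helper above
	 */
	bpf_ringbuf_submit_dynptr(&ptr, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";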
Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 +- kernel/bpf/verifier.c | 134 +++++++++++---------- .../selftests/bpf/prog_tests/kfunc_dynptr_param.c | 7 +- .../selftests/bpf/progs/test_kfunc_dynptr_param.c | 12 -- 4 files changed, 75 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 70d06a99f0b8..df0cb825e0e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -615,11 +615,9 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3194e9d9e4e4..fcd8a71035aa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -810,8 +810,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -830,9 +829,8 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; @@ -5859,6 +5857,65 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. + */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + return 0; + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. 
+ */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else { + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + regno); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = ""; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, regno); + return -EINVAL; + } + } + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -6390,52 +6447,8 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. - */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - break; - if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { - verbose(env, "Dynptr has to be an uninitialized dynptr\n"); - return -EINVAL; - } - - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. - */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; - } - - meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - arg + 1); - return -EINVAL; - } else if (!is_dynptr_type_expected(env, reg, arg_type)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local"; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf"; - break; - default: - err_extra = ""; - break; - } - verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - err_extra, arg + 1); - return -EINVAL; - } + if (process_dynptr_func(env, regno, arg_type, meta)) + return -EACCES; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -8829,22 +8842,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_DYNPTR: - if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d expected pointer to stack\n", i); - return -EINVAL; - } - - if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, "arg#%d pointer type %s %s must be valid and initialized\n", - i, btf_type_str(ref_t), ref_tname); + if (reg->type != PTR_TO_STACK && + reg->type != PTR_TO_DYNPTR) { + verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - if (!is_dynptr_type_expected(env, reg, ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { - verbose(env, "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", - i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + if (ret < 0) + return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c 
b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c index 55d641c1f126..a9229260a6ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c @@ -18,11 +18,8 @@ static struct { const char *expected_verifier_err_msg; int expected_runtime_err; } kfunc_dynptr_tests[] = { - {"dynptr_type_not_supp", - "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0}, - {"not_valid_dynptr", - "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0}, - {"not_ptr_to_stack", "arg#0 expected pointer to stack", 0}, + {"not_valid_dynptr", "Expected an initialized dynptr as arg #1", 0}, + {"not_ptr_to_stack", "arg#0 expected pointer to stack or dynptr_ptr", 0}, {"dynptr_data_null", NULL, -EBADMSG}, }; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index ce39d096bba3..f4a8250329b2 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -32,18 +32,6 @@ int err, pid; char _license[] SEC("license") = "GPL"; -SEC("?lsm.s/bpf") -int BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr, - unsigned int size) -{ - char write_data[64] = "hello there, world!!"; - struct bpf_dynptr ptr; - - bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); - - return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL); -} - SEC("?lsm.s/bpf") int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) { -- cgit v1.2.3 From 270605317366e4535d8d9fc3d9da1ad0fb3c9d45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. 
Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- include/uapi/linux/bpf.h | 8 +- kernel/bpf/helpers.c | 18 +- kernel/bpf/verifier.c | 227 +++++++++++++++------ scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 8 +- .../selftests/bpf/prog_tests/user_ringbuf.c | 4 +- 7 files changed, 191 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4920ac252754..3de24cfb7a3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -775,7 +775,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2828,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. 
* - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6ed5875b97a3..fa1093d4ad6b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1414,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1438,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1506,12 +1506,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1532,14 +1532,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1560,7 +1560,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c 
index eb742ac75844..a880776bd999 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -592,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -725,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -746,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -756,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -779,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -813,9 +832,14 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized 
bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -834,15 +858,19 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -1354,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1419,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -5857,19 +5895,58 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. 
+ */ int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - return 0; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -5885,7 +5962,13 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, } meta->uninit_dynptr_regno = regno; - } else { + } else /* MEM_RDONLY and None case from above */ { + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", @@ -5893,7 +5976,8 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - if (!is_dynptr_type_expected(env, reg, arg_type)) { + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { const char *err_extra = ""; switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -6056,7 +6140,7 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, - PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + CONST_PTR_TO_DYNPTR, } }; @@ -6241,12 +6325,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; - return state->stack[spi].spilled_ptr.id; + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -6311,11 +6399,22 @@ skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, 
reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK) { + spi = get_spi(reg->off); + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -7289,11 +7388,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); - * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; - __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -7687,7 +7785,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7705,15 +7811,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. 
+ */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7787,11 +7902,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (base_type(reg->type) != PTR_TO_DYNPTR) - /* Find the id of the dynptr we're - * tracking the reference of - */ - meta.ref_obj_id = stack_slot_get_id(env, reg); + meta.ref_obj_id = dynptr_ref_obj_id(env, reg); break; } } @@ -8848,12 +8959,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_DYNPTR: if (reg->type != PTR_TO_STACK && - reg->type != PTR_TO_DYNPTR) { + reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); return -EINVAL; } - ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR, NULL); + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); if (ret < 0) return ret; break; diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index fdb0aff8cb5a..e8d90829f23e 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -752,6 +752,7 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'const struct bpf_dynptr', 'struct iphdr', 'struct ipv6hdr', } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f89de51a45db..464ca3f01fe7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5293,7 +5293,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5303,7 +5303,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5313,7 +5313,7 @@ union bpf_attr { * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * - * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. 
* @@ -5414,7 +5414,7 @@ union bpf_attr { * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * - * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index 02b18d018b36..aefa0a474e58 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -673,8 +673,8 @@ static struct { {"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"}, {"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"}, {"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"}, - {"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"}, - {"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"}, + {"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"}, + {"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"}, {"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"}, }; -- cgit v1.2.3 From e47877c7aa821c413b45e05f804819579bdfa1a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Dec 2022 11:36:32 -1000 Subject: rhashtable: Allow rhashtable to be used from irq-safe contexts rhashtable currently only does bh-safe synchronization, making it impossible to use it from irq-safe contexts. Switch it to use irq-safe synchronization to remove the restriction. v2: Update the lock functions to return the ulong flags value and the unlock functions to take the value directly instead of passing around the pointer. Suggested by Linus. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 61 +++++++++++++++++++++++++++------------------- lib/rhashtable.c | 16 +++++++----- 2 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 68dab3e08aad..5b5357c0bd8c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -323,29 +323,36 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( * When we write to a bucket without unlocking, we use rht_assign_locked().
*/ -static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) +static inline unsigned long rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); + return flags; } -static inline void rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bucket, - unsigned int subclass) +static inline unsigned long rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); + return flags; } static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) + struct rhash_lock_head __rcu **bkt, + unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); - local_bh_enable(); + local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( @@ -393,7 +400,8 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) + struct rhash_head *obj, + unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; @@ -401,7 +409,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); - local_bh_enable(); + local_irq_restore(flags); } /** @@ -706,6 +714,7 @@ static inline void *__rhashtable_insert_fast( struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; + unsigned long flags; unsigned int hash; int elasticity; void *data; @@ -720,11 +729,11 @@ static inline void *__rhashtable_insert_fast( if (!bkt) goto out; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } @@ -756,9 +765,9 @@ slow_path: RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } @@ -785,7 +794,7 @@ slow_path: } atomic_inc(&ht->nelems); - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); @@ -797,7 +806,7 @@ out: return data; out_unlock: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); goto out; } @@ -991,6 +1000,7 @@ static inline int __rhashtable_remove_fast_one( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -999,7 +1009,7 @@ static inline int __rhashtable_remove_fast_one( if (!bkt) return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; @@ -1043,14 +1053,14 @@ static inline int __rhashtable_remove_fast_one( if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, 
bkt, obj, flags); } goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); @@ -1143,6 +1153,7 @@ static inline int __rhashtable_replace_fast( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -1158,7 +1169,7 @@ static inline int __rhashtable_replace_fast( return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { @@ -1169,15 +1180,15 @@ static inline int __rhashtable_replace_fast( rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj_new); + rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: return err; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e12bbfb240b8..6ae2ba8e06a2 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -231,6 +231,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; + unsigned long flags; if (new_tbl->nest) goto out; @@ -253,13 +254,14 @@ static int rhashtable_rehash_one(struct rhashtable *ht, new_hash = head_hashfn(ht, new_tbl, entry); - rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); + flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], + SINGLE_DEPTH_NESTING); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); - rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry); + rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); if (pprev) rcu_assign_pointer(*pprev, next); @@ -276,18 +278,19 @@ static int rhashtable_rehash_chain(struct rhashtable *ht, { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); + unsigned long flags; int err; if (!bkt) return 0; - rht_lock(old_tbl, bkt); + flags = rht_lock(old_tbl, bkt); while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; if (err == -ENOENT) err = 0; - rht_unlock(old_tbl, bkt); + rht_unlock(old_tbl, bkt, flags); return err; } @@ -590,6 +593,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; + unsigned long flags; unsigned int hash; void *data; @@ -607,7 +611,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, @@ -615,7 +619,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } } while (!IS_ERR_OR_NULL(new_tbl)); -- cgit v1.2.3 From ce098da1497c6dee9589fce2c61d1910f4fcf0e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Dec 2022 22:02:59 -0800 Subject: skbuff: Introduce slab_build_skb() syzkaller reported: BUG: KASAN: slab-out-of-bounds in __build_skb_around+0x235/0x340 
net/core/skbuff.c:294 Write of size 32 at addr ffff88802aa172c0 by task syz-executor413/5295 This was hit in bpf_prog_test_run_skb(), which uses a kmalloc()ed buffer passed to build_skb(). When build_skb() is passed a frag_size of 0, it means the buffer came from kmalloc. In these cases, ksize() is used to find its actual size, but since the allocation may not have been made to that size, actually perform the krealloc() call so that all the associated buffer size checking will be correctly informed (and use the "new" pointer so that compiler hinting works correctly). Split this logic out into a new interface, slab_build_skb(), but leave the original 0 checking for now to catch any stragglers. Reported-by: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Link: https://groups.google.com/g/syzkaller-bugs/c/UnIKxTtU5-0/m/-wbXinkgAQAJ Fixes: 38931d8989b5 ("mm: Make ksize() a reporting-only function") Cc: Pavel Begunkov Cc: pepsipu Cc: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Cc: Vlastimil Babka Cc: kasan-dev Cc: Andrii Nakryiko Cc: ast@kernel.org Cc: Daniel Borkmann Cc: Hao Luo Cc: Jesper Dangaard Brouer Cc: John Fastabend Cc: jolsa@kernel.org Cc: KP Singh Cc: martin.lau@linux.dev Cc: Stanislav Fomichev Cc: song@kernel.org Cc: Yonghong Song Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221208060256.give.994-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 2 +- include/linux/skbuff.h | 1 + net/bpf/test_run.c | 2 +- net/core/skbuff.c | 70 +++++++++++++++++++++++++++---- 5 files changed, 66 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index dbe310144780..9f473854b0f4 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -3045,7 +3045,7 @@ error: dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size, DMA_FROM_DEVICE); - skb = build_skb(data, 0); + skb = slab_build_skb(data); if (!skb) { kfree(data); goto error; diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index ed274f033626..e5116a86cfbc 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -200,7 +200,7 @@ static void qed_ll2b_complete_rx_packet(void *cxt, dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, cdev->ll2->rx_size, DMA_FROM_DEVICE); - skb = build_skb(buffer->data, 0); + skb = slab_build_skb(buffer->data); if (!skb) { DP_INFO(cdev, "Failed to build SKB\n"); kfree(buffer->data); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4e464a27adaf..4c8492401a10 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1255,6 +1255,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); +struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6094ef7cffcd..c9bfd263dcef 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1128,7 +1128,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, } sock_init_data(NULL, sk); - skb = build_skb(data, 0); + skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4bf95e36ed16..3cbba7099c0f ---
a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -270,12 +270,10 @@ static struct sk_buff *napi_skb_cache_get(void) return skb; } -/* Caller must provide SKB that is memset cleared */ -static void __build_skb_around(struct sk_buff *skb, void *data, - unsigned int frag_size) +static inline void __finalize_skb_around(struct sk_buff *skb, void *data, + unsigned int size) { struct skb_shared_info *shinfo; - unsigned int size = frag_size ? : ksize(data); size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -297,15 +295,71 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb_set_kcov_handle(skb, kcov_common_handle()); } +static inline void *__slab_build_skb(struct sk_buff *skb, void *data, + unsigned int *size) +{ + void *resized; + + /* Must find the allocation size (and grow it to match). */ + *size = ksize(data); + /* krealloc() will immediately return "data" when + * "ksize(data)" is requested: it is the existing upper + * bounds. As a result, GFP_ATOMIC will be ignored. Note + * that this "new" pointer needs to be passed back to the + * caller for use so the __alloc_size hinting will be + * tracked correctly. + */ + resized = krealloc(data, *size, GFP_ATOMIC); + WARN_ON_ONCE(resized != data); + return resized; +} + +/* build_skb() variant which can operate on slab buffers. + * Note that this should be used sparingly as slab buffers + * cannot be combined efficiently by GRO! + */ +struct sk_buff *slab_build_skb(void *data) +{ + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + data = __slab_build_skb(skb, data, &size); + __finalize_skb_around(skb, data, size); + + return skb; +} +EXPORT_SYMBOL(slab_build_skb); + +/* Caller must provide SKB that is memset cleared */ +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) +{ + unsigned int size = frag_size; + + /* frag_size == 0 is considered deprecated now. Callers + * using slab buffer should use slab_build_skb() instead. + */ + if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) + data = __slab_build_skb(skb, data, &size); + + __finalize_skb_around(skb, data, size); +} + /** * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and - * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator - * or vmalloc() + * skb_shared_info. @data must have been allocated from the page + * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() + * allocation is deprecated, and callers should use slab_build_skb() + * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : -- cgit v1.2.3 From 5dd9cdbc9dec3e99b19e483767e247d15ca8cc0d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:29 +0200 Subject: bpf: states_equal() must build idmap for all function frames verifier.c:states_equal() must maintain the register ID mapping across all function frames. Otherwise the following example might be erroneously marked as safe:

main:
  fp[-24] = map_lookup_elem(...)   ; frame[0].fp[-24].id == 1
  fp[-32] = map_lookup_elem(...)   ; frame[0].fp[-32].id == 2
  r1 = &fp[-24]
  r2 = &fp[-32]
  call foo()
  r0 = 0
  exit

foo:
0: r9 = r1
1: r8 = r2
2: r7 = ktime_get_ns()
3: r6 = ktime_get_ns()
4: if (r6 > r7) goto skip_assign
5: r9 = r8
skip_assign:            ; <--- checkpoint
6: r9 = *r9             ; (a) frame[1].r9.id == 2
                        ; (b) frame[1].r9.id == 1
7: if r9 == 0 goto exit ; mark_ptr_or_null_regs() transfers != 0 info
                        ; for all regs sharing ID:
                        ; (a) r9 != 0 => &frame[0].fp[-32] != 0
                        ; (b) r9 != 0 => &frame[0].fp[-24] != 0
8: r8 = *r8             ; (a) r8 == &frame[0].fp[-32]
                        ; (b) r8 == &frame[0].fp[-32]
9: r0 = *r8             ; (a) safe
                        ; (b) unsafe
exit:
10: exit

While processing the call to foo() the verifier considers the following execution paths:

  (a) 0-10
  (b) 0-4,6-10

(There is also the path 0-7,10, but it is not interesting for the issue at hand; (a) is verified first.) Suppose that a checkpoint is created at (6) when path (a) is verified; next, path (b) is verified and (6) is reached. If states_equal() maintained a separate 'idmap' for each frame, the mapping at (6) for frame[1] would be empty, and regsafe(r9)::check_ids() would add the pair 2->1 and return true, which is an error. If states_equal() maintains a single 'idmap' for all frames, the mapping at (6) is { 1->1, 2->2 }, and regsafe(r9)::check_ids() returns false when trying to add the pair 2->1. This issue was suggested in the following discussion: https://lore.kernel.org/bpf/CAEf4BzbFB5g4oUfyxk9rHy-PJSLQ3h8q9mV=rVoXfr_JVm8+1Q@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/verifier.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index df0cb825e0e3..53d175cbaa02 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -273,9 +273,9 @@ struct bpf_id_pair { u32 cur; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 97645f767354..00c8e0a28203 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13260,7 +13260,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat { int i; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], env->idmap_scratch)) @@ -13284,6 +13283,8 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. */ -- cgit v1.2.3 From 860b7f27b8f78564ca5a2f607e0820b2d352a562 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:57 +0200 Subject: linux/virtio_net.h: Support USO offload in vnet header. Now it is possible to convert USO vnet packets from/to an skb; support for the GSO_UDP_L4 offload has been added. Signed-off-by: Andrew Melnychenko Signed-off-by: David S.
Miller --- include/linux/virtio_net.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index a960de68ac69..bdf8de2cdd93 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -15,6 +15,7 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: @@ -31,6 +32,7 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: @@ -69,6 +71,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; + case VIRTIO_NET_HDR_GSO_UDP_L4: + gso_type = SKB_GSO_UDP_L4; + ip_proto = IPPROTO_UDP; + thlen = sizeof(struct udphdr); + break; default: return -EINVAL; } @@ -182,6 +189,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP_L4) + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) -- cgit v1.2.3 From 983055bf839745ec2812fc55cacd96888aa0d7a6 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 1 Dec 2022 02:46:54 +0900 Subject: USB: core: export usb_cache_string() usb_cache_string() can also be useful for drivers, so export it.
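As a rough usage sketch (hypothetical driver code, not part of the patch): the returned string is a kmalloc()ed copy owned by the caller, so it must eventually be kfree()d.

	/* Hypothetical probe path caching a device string descriptor;
	 * everything here except usb_cache_string() itself is made up
	 * for illustration.
	 */
	static int example_probe(struct usb_interface *intf,
				 const struct usb_device_id *id)
	{
		struct usb_device *udev = interface_to_usbdev(intf);
		char *name;

		name = usb_cache_string(udev, udev->descriptor.iProduct);
		if (name)
			dev_info(&intf->dev, "product string: %s\n", name);
		kfree(name);	/* kfree(NULL) is a no-op */
		return 0;
	}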
Signed-off-by: Vincent Mailhol Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20221130174658.29282-4-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- drivers/usb/core/message.c | 1 + drivers/usb/core/usb.h | 1 - include/linux/usb.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 4d59d927ae3e..127fac1af676 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -1037,6 +1037,7 @@ char *usb_cache_string(struct usb_device *udev, int index) } return smallbuf; } +EXPORT_SYMBOL_GPL(usb_cache_string); /* * usb_get_device_descriptor - (re)reads the device descriptor (usbcore) diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index 82538daac8b8..0eac7d4285d1 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -47,7 +47,6 @@ extern int usb_get_device_descriptor(struct usb_device *dev, extern int usb_set_isoch_delay(struct usb_device *dev); extern int usb_get_bos_descriptor(struct usb_device *dev); extern void usb_release_bos_descriptor(struct usb_device *dev); -extern char *usb_cache_string(struct usb_device *udev, int index); extern int usb_set_configuration(struct usb_device *dev, int configuration); extern int usb_choose_configuration(struct usb_device *udev); extern int usb_generic_driver_probe(struct usb_device *udev); diff --git a/include/linux/usb.h b/include/linux/usb.h index 9ff1ad4dfad1..d2d2f41052c0 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1829,6 +1829,7 @@ static inline int usb_get_ptm_status(struct usb_device *dev, void *data) extern int usb_string(struct usb_device *dev, int index, char *buf, size_t size); +extern char *usb_cache_string(struct usb_device *udev, int index); /* wrappers that also update important state inside usbcore */ extern int usb_clear_halt(struct usb_device *dev, int pipe); -- cgit v1.2.3 From 8a321cf7becc6c065ae595b837b826a2a81036b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 9 Dec 2022 10:21:38 -0500 Subject: net: add IFF_NO_ADDRCONF and use it in bonding to prevent ipv6 addrconf Currently, bonding reuses the IFF_SLAVE flag and checks it in ipv6 addrconf to prevent ipv6 address configuration. However, IFF_SLAVE is not a proper flag for suppressing ipv6 addrconf, and using it forces bonding to move the IFF_SLAVE flag setting ahead of dev_open() in bond_enslave(). Also, IFF_MASTER/SLAVE are historical flags used in bonding and eql; as Jiri mentioned, newer devices like Team and Failover do not use them. So, as Jiri suggested, this patch adds IFF_NO_ADDRCONF to the device's priv_flags to indicate that ipv6 addrconf should be skipped, uses it in bonding, and moves the IFF_SLAVE flag setting back to its original place.
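The intended driver-side pattern, as a minimal sketch (the enslave function is hypothetical; only the flag handling mirrors the bonding change above):

	/* Hypothetical master driver enslaving a lower device: set
	 * IFF_NO_ADDRCONF before dev_open() so ipv6 addrconf never runs
	 * on the lower device, and clear it again if the open fails
	 * (and on release).
	 */
	static int example_enslave(struct net_device *lower,
				   struct netlink_ext_ack *extack)
	{
		int err;

		lower->priv_flags |= IFF_NO_ADDRCONF;
		err = dev_open(lower, extack);
		if (err)
			lower->priv_flags &= ~IFF_NO_ADDRCONF;
		return err;
	}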
Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 18 +++++++++++++----- include/linux/netdevice.h | 3 ++- net/ipv6/addrconf.c | 4 ++-- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4048876f842c..f7767afe116b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1632,13 +1632,19 @@ static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, { struct netdev_lag_upper_info lag_upper_info; enum netdev_lag_tx_type type; + int err; type = bond_lag_tx_type(bond); lag_upper_info.tx_type = type; lag_upper_info.hash_type = bond_lag_hash_type(bond, type); - return netdev_master_upper_dev_link(slave->dev, bond->dev, slave, - &lag_upper_info, extack); + err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, + &lag_upper_info, extack); + if (err) + return err; + + slave->dev->flags |= IFF_SLAVE; + return 0; } static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) @@ -1950,8 +1956,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, } } - /* set slave flag before open to prevent IPv6 addrconf */ - slave_dev->flags |= IFF_SLAVE; + /* set no_addrconf flag before open to prevent IPv6 addrconf */ + slave_dev->priv_flags |= IFF_NO_ADDRCONF; /* open the slave since the application closed it */ res = dev_open(slave_dev, extack); @@ -2254,7 +2260,7 @@ err_close: dev_close(slave_dev); err_restore_mac: - slave_dev->flags &= ~IFF_SLAVE; + slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* XXX TODO - fom follow mode needs to change master's @@ -2446,6 +2452,8 @@ static int __bond_release_one(struct net_device *bond_dev, /* close slave before restoring its mac address */ dev_close(slave_dev); + slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; + if (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* restore original ("permanent") mac address */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2287cb8eb9e4..aad12a179e54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1662,6 +1662,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1697,7 +1698,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - /* was IFF_LIVE_RENAME_OK */ + IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9c3f5202a97b..c338dfb05d2c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3320,7 +3320,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) return; /* no link local addresses on devices flagged as slaves */ - if (idev->dev->flags & IFF_SLAVE) + if (idev->dev->priv_flags & IFF_NO_ADDRCONF) return; ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); @@ -3560,7 +3560,7 @@ static int 
addrconf_notify(struct notifier_block *this, unsigned long event, if (idev && idev->cnf.disable_ipv6) break; - if (dev->flags & IFF_SLAVE) { + if (dev->priv_flags & IFF_NO_ADDRCONF) { if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) ipv6_mc_up(idev); -- cgit v1.2.3
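Finally, the slab_build_skb() conversion pattern from the skbuff patch above, as a hypothetical receive path (function and variable names are made up; the error handling mirrors the bnx2 hunk):

	/* "data" came from kmalloc() and leaves room for skb_shared_info,
	 * so slab_build_skb() replaces the deprecated build_skb(data, 0).
	 * On failure the caller still owns "data" and must free it.
	 */
	static struct sk_buff *example_rx_to_skb(void *data, unsigned int len)
	{
		struct sk_buff *skb;

		skb = slab_build_skb(data);
		if (!skb) {
			kfree(data);
			return NULL;
		}
		skb_put(skb, len);	/* payload already written by the device */
		return skb;
	}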