From 30ca1aa536211f5ac3de0173513a7a99a98a97f3 Mon Sep 17 00:00:00 2001
From: Dedy Lansky <dlansky@codeaurora.org>
Date: Sun, 29 Jul 2018 14:59:16 +0300
Subject: cfg80211/mac80211: make ieee80211_send_layer2_update a public
 function

Make ieee80211_send_layer2_update() a common function so other drivers
can re-use it.

Signed-off-by: Dedy Lansky <dlansky@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9a850973e09a..4f57f770f602 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4732,6 +4732,17 @@ static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
 const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
 				  const u8 *ies, int len);
 
+/**
+ * cfg80211_send_layer2_update - send layer 2 update frame
+ *
+ * @dev: network device
+ * @addr: STA MAC address
+ *
+ * Wireless drivers can use this function to update forwarding tables in bridge
+ * devices upon STA association.
+ */
+void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr);
+
 /**
  * DOC: Regulatory enforcement infrastructure
  *
-- 
cgit v1.2.3


From 21a5d4c3a45ca608477a083096cfbce76e449a0c Mon Sep 17 00:00:00 2001
From: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Date: Wed, 11 Jul 2018 00:12:53 +0530
Subject: mac80211: add stop/start logic for software TXQs

Sometimes, it is required to stop the transmissions momentarily and
resume it later; stopping the txqs becomes very critical in scenarios where
the packet transmission has to be ceased completely. For example, during
the hardware restart, during off channel operations,
when initiating CSA(upon detecting a radar on the DFS channel), etc.

The TX queue stop/start logic in mac80211 works well in stopping the TX
when drivers make use of netdev queues, i.e, when Qdiscs in network layer
take care of traffic scheduling. Since the devices implementing
wake_tx_queue can run without Qdiscs, packets will be handed to mac80211
directly without queueing them in the netdev queues.

Also, mac80211 does not invoke any of the
netif_stop_*/netif_wake_* APIs if wake_tx_queue is implemented.
Since the queues are not stopped in this case, transmissions can continue
and this will impact negatively on the operation of the wireless device.

For example,
During hardware restart, we stop the netdev queues so that packets are
not sent to the driver. Since ath10k implements wake_tx_queue,
TX queues will not be stopped and packets might reach the hardware while
it is restarting; this can make hardware unresponsive and the only
possible option for recovery is to reboot the entire system.

There is another problem to this, it is observed that the packets
were sent on the DFS channel for a prolonged duration after radar
detection impacting the channel closing time.

We can still invoke netif stop/wake APIs when wake_tx_queue is implemented
but this could lead to packet drops in network layer; adding stop/start
logic for software TXQs in mac80211 instead makes more sense; the change
proposed adds the same in mac80211.

Signed-off-by: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |   4 ++
 net/mac80211/ieee80211_i.h |   3 ++
 net/mac80211/main.c        |   4 ++
 net/mac80211/tx.c          |  11 ++++-
 net/mac80211/util.c        | 110 ++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 125 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5790f55c241d..e248f5fe5b19 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1504,6 +1504,8 @@ enum ieee80211_vif_flags {
  * @drv_priv: data area for driver use, will always be aligned to
  *	sizeof(void \*).
  * @txq: the multicast data TX queue (if driver uses the TXQ abstraction)
+ * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped,
+ *	protected by fq->lock.
  */
 struct ieee80211_vif {
 	enum nl80211_iftype type;
@@ -1528,6 +1530,8 @@ struct ieee80211_vif {
 
 	unsigned int probe_req_reg;
 
+	bool txqs_stopped[IEEE80211_NUM_ACS];
+
 	/* must be last */
 	u8 drv_priv[0] __aligned(sizeof(void *));
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 172aeae21ae9..348a52cefb43 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -818,6 +818,7 @@ enum txq_info_flags {
 	IEEE80211_TXQ_STOP,
 	IEEE80211_TXQ_AMPDU,
 	IEEE80211_TXQ_NO_AMSDU,
+	IEEE80211_TXQ_STOP_NETIF_TX,
 };
 
 /**
@@ -1226,6 +1227,7 @@ struct ieee80211_local {
 
 	struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
 	struct tasklet_struct tx_pending_tasklet;
+	struct tasklet_struct wake_txqs_tasklet;
 
 	atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
 
@@ -2038,6 +2040,7 @@ void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
 			       struct ieee80211_sub_if_data *sdata);
 void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
 			      struct txq_info *txqi);
+void ieee80211_wake_txqs(unsigned long data);
 void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 			 u16 transaction, u16 auth_alg, u16 status,
 			 const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4fb2709cb527..e1215416a207 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -668,6 +668,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
 		     (unsigned long)local);
 
+	if (ops->wake_tx_queue)
+		tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs,
+			     (unsigned long)local);
+
 	tasklet_init(&local->tasklet,
 		     ieee80211_tasklet_handler,
 		     (unsigned long) local);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index cd332e3e1134..093108077edc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3466,13 +3466,19 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct ieee80211_tx_info *info;
 	struct ieee80211_tx_data tx;
 	ieee80211_tx_result r;
-	struct ieee80211_vif *vif;
+	struct ieee80211_vif *vif = txq->vif;
 
 	spin_lock_bh(&fq->lock);
 
-	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
+	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ||
+	    test_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags))
 		goto out;
 
+	if (vif->txqs_stopped[ieee80211_ac_from_tid(txq->tid)]) {
+		set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags);
+		goto out;
+	}
+
 	/* Make sure fragments stay together. */
 	skb = __skb_dequeue(&txqi->frags);
 	if (skb)
@@ -3567,6 +3573,7 @@ begin:
 	}
 
 	IEEE80211_SKB_CB(skb)->control.vif = vif;
+
 out:
 	spin_unlock_bh(&fq->lock);
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 88efda7c9f8a..d886789ff59e 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -240,6 +240,99 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL(ieee80211_ctstoself_duration);
 
+static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_vif *vif = &sdata->vif;
+	struct fq *fq = &local->fq;
+	struct ps_data *ps = NULL;
+	struct txq_info *txqi;
+	struct sta_info *sta;
+	int i;
+
+	spin_lock_bh(&fq->lock);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
+		ps = &sdata->bss->ps;
+
+	sdata->vif.txqs_stopped[ac] = false;
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		if (sdata != sta->sdata)
+			continue;
+
+		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+			struct ieee80211_txq *txq = sta->sta.txq[i];
+
+			txqi = to_txq_info(txq);
+
+			if (ac != txq->ac)
+				continue;
+
+			if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX,
+						&txqi->flags))
+				continue;
+
+			spin_unlock_bh(&fq->lock);
+			drv_wake_tx_queue(local, txqi);
+			spin_lock_bh(&fq->lock);
+		}
+	}
+
+	if (!vif->txq)
+		goto out;
+
+	txqi = to_txq_info(vif->txq);
+
+	if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags) ||
+	    (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac)
+		goto out;
+
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+	return;
+out:
+	spin_unlock_bh(&fq->lock);
+}
+
+void ieee80211_wake_txqs(unsigned long data)
+{
+	struct ieee80211_local *local = (struct ieee80211_local *)data;
+	struct ieee80211_sub_if_data *sdata;
+	int n_acs = IEEE80211_NUM_ACS;
+	unsigned long flags;
+	int i;
+
+	rcu_read_lock();
+	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+
+	if (local->hw.queues < IEEE80211_NUM_ACS)
+		n_acs = 1;
+
+	for (i = 0; i < local->hw.queues; i++) {
+		if (local->queue_stop_reasons[i])
+			continue;
+
+		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+		list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+			int ac;
+
+			for (ac = 0; ac < n_acs; ac++) {
+				int ac_queue = sdata->vif.hw_queue[ac];
+
+				if (ac_queue == i ||
+				    sdata->vif.cab_queue == i)
+					__ieee80211_wake_txqs(sdata, ac);
+			}
+		}
+		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+	rcu_read_unlock();
+}
+
 void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
 {
 	struct ieee80211_sub_if_data *sdata;
@@ -308,6 +401,9 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
 		rcu_read_unlock();
 	} else
 		tasklet_schedule(&local->tx_pending_tasklet);
+
+	if (local->ops->wake_tx_queue)
+		tasklet_schedule(&local->wake_txqs_tasklet);
 }
 
 void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
@@ -351,9 +447,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
 	if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
 		return;
 
-	if (local->ops->wake_tx_queue)
-		return;
-
 	if (local->hw.queues < IEEE80211_NUM_ACS)
 		n_acs = 1;
 
@@ -366,8 +459,15 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
 
 		for (ac = 0; ac < n_acs; ac++) {
 			if (sdata->vif.hw_queue[ac] == queue ||
-			    sdata->vif.cab_queue == queue)
-				netif_stop_subqueue(sdata->dev, ac);
+			    sdata->vif.cab_queue == queue) {
+				if (!local->ops->wake_tx_queue) {
+					netif_stop_subqueue(sdata->dev, ac);
+					continue;
+				}
+				spin_lock(&local->fq.lock);
+				sdata->vif.txqs_stopped[ac] = true;
+				spin_unlock(&local->fq.lock);
+			}
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 9cf0a0b4b64ae103cf0e7dfaa72b44ecda24c0eb Mon Sep 17 00:00:00 2001
From: Alexei Avshalom Lazar <ailizaro@codeaurora.org>
Date: Mon, 13 Aug 2018 15:33:00 +0300
Subject: cfg80211: Add support for 60GHz band channels 5 and 6

The current support in the 60GHz band is for channels 1-4.
Add support for channels 5 and 6.
This requires enlarging ieee80211_channel.center_freq from u16 to u32.

Signed-off-by: Alexei Avshalom Lazar <ailizaro@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/wil6210/debugfs.c | 2 +-
 include/net/cfg80211.h                     | 2 +-
 include/uapi/linux/nl80211.h               | 2 +-
 net/wireless/reg.c                         | 2 +-
 net/wireless/trace.h                       | 2 +-
 net/wireless/util.c                        | 6 +++---
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index 51c3330bc316..49533f884993 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -1436,7 +1436,7 @@ static int wil_freq_debugfs_show(struct seq_file *s, void *data)
 {
 	struct wil6210_priv *wil = s->private;
 	struct wireless_dev *wdev = wil->main_ndev->ieee80211_ptr;
-	u16 freq = wdev->chandef.chan ? wdev->chandef.chan->center_freq : 0;
+	u32 freq = wdev->chandef.chan ? wdev->chandef.chan->center_freq : 0;
 
 	seq_printf(s, "Freq = %d\n", freq);
 
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4f57f770f602..46c4cbf54903 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -149,7 +149,7 @@ enum ieee80211_channel_flags {
  */
 struct ieee80211_channel {
 	enum nl80211_band band;
-	u16 center_freq;
+	u32 center_freq;
 	u16 hw_value;
 	u32 flags;
 	int max_antenna_gain;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7acc16f34942..023989604fc6 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4338,7 +4338,7 @@ enum nl80211_txrate_gi {
  * enum nl80211_band - Frequency band
  * @NL80211_BAND_2GHZ: 2.4 GHz ISM band
  * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz)
- * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 64.80 GHz)
+ * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz)
  * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace
  *	since newer kernel versions may support more bands
  */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 325d74a94a36..043d43573ca2 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -3301,7 +3301,7 @@ void regulatory_hint_disconnect(void)
 	restore_regulatory_settings(false);
 }
 
-static bool freq_is_chan_12_13_14(u16 freq)
+static bool freq_is_chan_12_13_14(u32 freq)
 {
 	if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) ||
 	    freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) ||
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 7c73510b161f..5e7eec849200 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -112,7 +112,7 @@
 	} while (0)
 
 #define CHAN_ENTRY __field(enum nl80211_band, band) \
-		   __field(u16, center_freq)
+		   __field(u32, center_freq)
 #define CHAN_ASSIGN(chan)					  \
 	do {							  \
 		if (chan) {					  \
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 7bdcfe19b3ef..2a89db5f2db7 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -88,7 +88,7 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
 			return 5000 + chan * 5;
 		break;
 	case NL80211_BAND_60GHZ:
-		if (chan < 5)
+		if (chan < 7)
 			return 56160 + chan * 2160;
 		break;
 	default:
@@ -109,7 +109,7 @@ int ieee80211_frequency_to_channel(int freq)
 		return (freq - 4000) / 5;
 	else if (freq <= 45000) /* DMG band lower limit */
 		return (freq - 5000) / 5;
-	else if (freq >= 58320 && freq <= 64800)
+	else if (freq >= 58320 && freq <= 70200)
 		return (freq - 56160) / 2160;
 	else
 		return 0;
@@ -1568,7 +1568,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
 	}
 
 	/* 56.16 GHz, channel 1..4 */
-	if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 4) {
+	if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) {
 		if (chandef->width >= NL80211_CHAN_WIDTH_40)
 			return false;
 
-- 
cgit v1.2.3


From 9c06602b1b920ed6b546632bdbbc1f400eea5242 Mon Sep 17 00:00:00 2001
From: Balaji Pothunoori <bpothuno@codeaurora.org>
Date: Thu, 19 Jul 2018 18:56:27 +0530
Subject: cfg80211: clarify frames covered by average ACK signal report

Modify the API to include all ACK frames in average ACK
signal strength reporting, not just ACKs for data frames.
Make exposing the data conditional on implementing the
extended feature flag.

This is how it was really implemented in mac80211, update
the code there to use the new defines and clean up some of
the setting code.

Keep nl80211.h source compatibility by keeping the old names.

Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
[rewrite commit log, change compatibility to be old=new
 instead of the other way around, update kernel-doc,
 roll in mac80211 changes, make mac80211 depend on valid
 bit instead of HW flag]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 18 +++++++++++-------
 net/mac80211/sta_info.c      |  6 +++---
 net/wireless/nl80211.c       |  7 ++++---
 3 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 023989604fc6..1766a12b231c 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3050,8 +3050,7 @@ enum nl80211_sta_bss_param {
  *	received from the station (u64, usec)
  * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment
  * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm)
- * @NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG: avg signal strength of (data)
- *	ACK frame (s8, dBm)
+ * @NL80211_STA_INFO_ACK_SIGNAL_AVG: avg signal strength of ACK frames (s8, dBm)
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3091,13 +3090,17 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_RX_DURATION,
 	NL80211_STA_INFO_PAD,
 	NL80211_STA_INFO_ACK_SIGNAL,
-	NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG,
+	NL80211_STA_INFO_ACK_SIGNAL_AVG,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
 	NL80211_STA_INFO_MAX = __NL80211_STA_INFO_AFTER_LAST - 1
 };
 
+/* we renamed this - stay compatible */
+#define NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG NL80211_STA_INFO_ACK_SIGNAL_AVG
+
+
 /**
  * enum nl80211_tid_stats - per TID statistics attributes
  * @__NL80211_TID_STATS_INVALID: attribute number 0 is reserved
@@ -5213,9 +5216,8 @@ enum nl80211_feature_flags {
  *	"radar detected" event.
  * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and
  *	receiving control port frames over nl80211 instead of the netdevice.
- * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack
- *	rssi if firmware support, this flag is to intimate about ack rssi
- *	support to nl80211.
+ * @NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT: This driver/device supports
+ *	(average) ACK signal strength reporting.
  * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate
  *      TXQs.
  * @NL80211_EXT_FEATURE_SCAN_RANDOM_SN: Driver/device supports randomizing the
@@ -5255,7 +5257,9 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN,
 	NL80211_EXT_FEATURE_DFS_OFFLOAD,
 	NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211,
-	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT,
+	NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT,
+	/* we renamed this - stay compatible */
+	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT = NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT,
 	NL80211_EXT_FEATURE_TXQS,
 	NL80211_EXT_FEATURE_SCAN_RANDOM_SN,
 	NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT,
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f34202242d24..a231d623b2d2 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2323,13 +2323,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 		sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
 	}
 
-	if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) &&
-	    !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) {
+	if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) &&
+	    sta->status_stats.ack_signal_filled) {
 		sinfo->avg_ack_signal =
 			-(s8)ewma_avg_signal_read(
 				&sta->status_stats.avg_ack_signal);
 		sinfo->filled |=
-			BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG);
+			BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG);
 	}
 }
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5fb9b7dd9831..62e6679de481 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4724,10 +4724,11 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
 	PUT_SINFO_U64(BEACON_RX, rx_beacon);
 	PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
-	PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
 	if (wiphy_ext_feature_isset(&rdev->wiphy,
-				    NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT))
-		PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8);
+				    NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) {
+		PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
+		PUT_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8);
+	}
 
 #undef PUT_SINFO
 #undef PUT_SINFO_U64
-- 
cgit v1.2.3


From b0d1beeff2a97a0cf1965ea8f1d13b8973f22582 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 28 Aug 2018 14:44:25 +0200
Subject: xdp: implement convert_to_xdp_frame for MEM_TYPE_ZERO_COPY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds proper MEM_TYPE_ZERO_COPY support for
convert_to_xdp_frame. Converting a MEM_TYPE_ZERO_COPY xdp_buff to an
xdp_frame is done by transforming the MEM_TYPE_ZERO_COPY buffer into a
MEM_TYPE_PAGE_ORDER0 frame. This is costly, and in the future it might
make sense to implement a more sophisticated thread-safe alloc/free
scheme for MEM_TYPE_ZERO_COPY, so that no allocation and copy is
required in the fast-path.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h |  5 +++--
 net/core/xdp.c    | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 76b95256c266..0d5c6fb4b2e2 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -91,6 +91,8 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
 	frame->dev_rx = NULL;
 }
 
+struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
+
 /* Convert xdp_buff to xdp_frame */
 static inline
 struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
@@ -99,9 +101,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	int metasize;
 	int headroom;
 
-	/* TODO: implement clone, copy, use "native" MEM_TYPE */
 	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
-		return NULL;
+		return xdp_convert_zc_to_xdp_frame(xdp);
 
 	/* Assure headroom is available for storing info */
 	headroom = xdp->data - xdp->data_hard_start;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 89b6785cef2a..be6cb2f0e722 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -398,3 +398,42 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
 	info->flags = bpf->flags;
 }
 EXPORT_SYMBOL_GPL(xdp_attachment_setup);
+
+struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
+{
+	unsigned int metasize, headroom, totsize;
+	void *addr, *data_to_copy;
+	struct xdp_frame *xdpf;
+	struct page *page;
+
+	/* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
+	metasize = xdp_data_meta_unsupported(xdp) ? 0 :
+		   xdp->data - xdp->data_meta;
+	headroom = xdp->data - xdp->data_hard_start;
+	totsize = xdp->data_end - xdp->data + metasize;
+
+	if (sizeof(*xdpf) + totsize > PAGE_SIZE)
+		return NULL;
+
+	page = dev_alloc_page();
+	if (!page)
+		return NULL;
+
+	addr = page_to_virt(page);
+	xdpf = addr;
+	memset(xdpf, 0, sizeof(*xdpf));
+
+	addr += sizeof(*xdpf);
+	data_to_copy = metasize ? xdp->data_meta : xdp->data;
+	memcpy(addr, data_to_copy, totsize);
+
+	xdpf->data = addr + metasize;
+	xdpf->len = totsize - metasize;
+	xdpf->headroom = 0;
+	xdpf->metasize = metasize;
+	xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+
+	xdp_return_buff(xdp);
+	return xdpf;
+}
+EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
-- 
cgit v1.2.3


From dce5bd6140a436e3348f6d13a1efb6e6c5a89acd Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 28 Aug 2018 14:44:26 +0200
Subject: xdp: export xdp_rxq_info_unreg_mem_model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export __xdp_rxq_info_unreg_mem_model as xdp_rxq_info_unreg_mem_model,
so it can be used from netdev drivers. Also, add additional checks for
the memory type.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h |  1 +
 net/core/xdp.c    | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0d5c6fb4b2e2..0f25b3675c5c 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -136,6 +136,7 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
 int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 			       enum xdp_mem_type type, void *allocator);
+void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq);
 
 /* Drivers not supporting XDP metadata can use this helper, which
  * rejects any room expansion for metadata as a result.
diff --git a/net/core/xdp.c b/net/core/xdp.c
index be6cb2f0e722..654dbb19707e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -94,11 +94,21 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
-static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
 	struct xdp_mem_allocator *xa;
 	int id = xdp_rxq->mem.id;
 
+	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+		WARN(1, "Missing register, driver bug");
+		return;
+	}
+
+	if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
+	    xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
+		return;
+	}
+
 	if (id == 0)
 		return;
 
@@ -110,6 +120,7 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 
 	mutex_unlock(&mem_id_lock);
 }
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
@@ -119,7 +130,7 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 
 	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
 
-	__xdp_rxq_info_unreg_mem_model(xdp_rxq);
+	xdp_rxq_info_unreg_mem_model(xdp_rxq);
 
 	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
 	xdp_rxq->dev = NULL;
-- 
cgit v1.2.3


From 902540342096af8a13351f6a22bfdd7a8e19ffd2 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 28 Aug 2018 14:44:27 +0200
Subject: xsk: expose xdp_umem_get_{data,dma} to drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the xdp_umem_get_{data,dma} functions to include/net/xdp_sock.h,
so that the upcoming zero-copy implementation in the Ethernet drivers
can utilize them.

Also, supply some dummy function implementations for
CONFIG_XDP_SOCKETS=n configs.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h     | 10 ----------
 2 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 7161856bcf9c..56994ad1ab40 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -79,6 +79,16 @@ void xsk_umem_discard_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
 void xsk_umem_consume_tx_done(struct xdp_umem *umem);
+
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
+{
+	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
+}
+
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+}
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -98,6 +108,39 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 {
 	return false;
 }
+
+static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+{
+	return NULL;
+}
+
+static inline void xsk_umem_discard_addr(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+}
+
+static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
+				       u32 *len)
+{
+	return false;
+}
+
+static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+}
+
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
+{
+	return NULL;
+}
+
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	return 0;
+}
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index f11560334f88..c8be1ad3eb88 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -8,16 +8,6 @@
 
 #include <net/xdp_sock.h>
 
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
-	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
-	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
-}
-
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u32 queue_id, u16 flags);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
-- 
cgit v1.2.3


From 6c5c9581044dd6e0cd284ab653502fb9264f08b6 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Tue, 28 Aug 2018 14:44:28 +0200
Subject: net: add napi_if_scheduled_mark_missed

The function napi_if_scheduled_mark_missed is used to check if the
NAPI context is scheduled, if so set NAPIF_STATE_MISSED and return
true. Used by the AF_XDP zero-copy i40e Tx code implementation in
order to make sure that irq affinity is honored by the napi context.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ca5ab98053c8..4271f6b4e419 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -535,6 +535,32 @@ static inline void napi_synchronize(const struct napi_struct *n)
 		barrier();
 }
 
+/**
+ *	napi_if_scheduled_mark_missed - if napi is running, set the
+ *	NAPIF_STATE_MISSED
+ *	@n: NAPI context
+ *
+ * If napi is running, set the NAPIF_STATE_MISSED, and return true if
+ * NAPI is scheduled.
+ **/
+static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
+{
+	unsigned long val, new;
+
+	do {
+		val = READ_ONCE(n->state);
+		if (val & NAPIF_STATE_DISABLE)
+			return true;
+
+		if (!(val & NAPIF_STATE_SCHED))
+			return false;
+
+		new = val | NAPIF_STATE_MISSED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	return true;
+}
+
 enum netdev_queue_state_t {
 	__QUEUE_STATE_DRV_XOFF,
 	__QUEUE_STATE_STACK_XOFF,
-- 
cgit v1.2.3


From 679c782de14bd48c19dd74cd1af20a2bc05dd936 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Wed, 22 Aug 2018 20:02:19 +0100
Subject: bpf/verifier: per-register parent pointers

By giving each register its own liveness chain, we elide the skip_callee()
 logic.  Instead, each register's parent is the state it inherits from;
 both check_func_call() and prepare_func_exit() automatically connect
 reg states to the correct chain since when they copy the reg state across
 (r1-r5 into the callee as args, and r0 out as the return value) they also
 copy the parent pointer.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |   8 +-
 kernel/bpf/verifier.c        | 184 +++++++++++--------------------------------
 2 files changed, 47 insertions(+), 145 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 38b04f559ad3..b42b60a83e19 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
 };
 
 struct bpf_reg_state {
+	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
 	union {
 		/* valid when type == PTR_TO_PACKET */
@@ -59,7 +60,6 @@ struct bpf_reg_state {
 	 * came from, when one is tested for != NULL.
 	 */
 	u32 id;
-	/* Ordering of fields matters.  See states_equal() */
 	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
 	 * the actual value.
 	 * For pointer types, this represents the variable part of the offset
@@ -76,15 +76,15 @@ struct bpf_reg_state {
 	s64 smax_value; /* maximum possible (s64)value */
 	u64 umin_value; /* minimum possible (u64)value */
 	u64 umax_value; /* maximum possible (u64)value */
+	/* parentage chain for liveness checking */
+	struct bpf_reg_state *parent;
 	/* Inside the callee two registers can be both PTR_TO_STACK like
 	 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
 	 * while another to the caller's stack. To differentiate them 'frameno'
 	 * is used which is an index in bpf_verifier_state->frame[] array
 	 * pointing to bpf_func_state.
-	 * This field must be second to last, for states_equal() reasons.
 	 */
 	u32 frameno;
-	/* This field must be last, for states_equal() reasons. */
 	enum bpf_reg_liveness live;
 };
 
@@ -107,7 +107,6 @@ struct bpf_stack_state {
  */
 struct bpf_func_state {
 	struct bpf_reg_state regs[MAX_BPF_REG];
-	struct bpf_verifier_state *parent;
 	/* index of call instruction that called into this func */
 	int callsite;
 	/* stack frame number of this function state from pov of
@@ -129,7 +128,6 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
-	struct bpf_verifier_state *parent;
 	u32 curframe;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92246117d2b0..68568d22d6bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -380,9 +380,9 @@ static int copy_stack_state(struct bpf_func_state *dst,
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minimal amount of memory. check_stack_write() access from
  * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
+ * which this function copies over. It points to corresponding reg in previous
+ * bpf_verifier_state which is never reallocated
  */
 static int realloc_func_state(struct bpf_func_state *state, int size,
 			      bool copy_old)
@@ -466,7 +466,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		dst_state->frame[i] = NULL;
 	}
 	dst_state->curframe = src->curframe;
-	dst_state->parent = src->parent;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -732,6 +731,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 	for (i = 0; i < MAX_BPF_REG; i++) {
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
+		regs[i].parent = NULL;
 	}
 
 	/* frame pointer */
@@ -876,74 +876,21 @@ next:
 	return 0;
 }
 
-static
-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
-				       const struct bpf_verifier_state *state,
-				       struct bpf_verifier_state *parent,
-				       u32 regno)
-{
-	struct bpf_verifier_state *tmp = NULL;
-
-	/* 'parent' could be a state of caller and
-	 * 'state' could be a state of callee. In such case
-	 * parent->curframe < state->curframe
-	 * and it's ok for r1 - r5 registers
-	 *
-	 * 'parent' could be a callee's state after it bpf_exit-ed.
-	 * In such case parent->curframe > state->curframe
-	 * and it's ok for r0 only
-	 */
-	if (parent->curframe == state->curframe ||
-	    (parent->curframe < state->curframe &&
-	     regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
-	    (parent->curframe > state->curframe &&
-	       regno == BPF_REG_0))
-		return parent;
-
-	if (parent->curframe > state->curframe &&
-	    regno >= BPF_REG_6) {
-		/* for callee saved regs we have to skip the whole chain
-		 * of states that belong to callee and mark as LIVE_READ
-		 * the registers before the call
-		 */
-		tmp = parent;
-		while (tmp && tmp->curframe != state->curframe) {
-			tmp = tmp->parent;
-		}
-		if (!tmp)
-			goto bug;
-		parent = tmp;
-	} else {
-		goto bug;
-	}
-	return parent;
-bug:
-	verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
-	verbose(env, "regno %d parent frame %d current frame %d\n",
-		regno, parent->curframe, state->curframe);
-	return NULL;
-}
-
+/* Parentage chain of this register (or stack slot) should take care of all
+ * issues like callee-saved registers, stack slot allocation time, etc.
+ */
 static int mark_reg_read(struct bpf_verifier_env *env,
-			 const struct bpf_verifier_state *state,
-			 struct bpf_verifier_state *parent,
-			 u32 regno)
+			 const struct bpf_reg_state *state,
+			 struct bpf_reg_state *parent)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 
-	if (regno == BPF_REG_FP)
-		/* We don't need to worry about FP liveness because it's read-only */
-		return 0;
-
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
+		if (writes && state->live & REG_LIVE_WRITTEN)
 			break;
-		parent = skip_callee(env, state, parent, regno);
-		if (!parent)
-			return -EFAULT;
 		/* ... then we depend on parent's value */
-		parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
+		parent->live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -969,7 +916,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		return mark_reg_read(env, vstate, vstate->parent, regno);
+		/* We don't need to worry about FP liveness because it's read-only */
+		if (regno != BPF_REG_FP)
+			return mark_reg_read(env, &regs[regno],
+					     regs[regno].parent);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1080,8 +1030,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	} else {
 		u8 type = STACK_MISC;
 
-		/* regular write of data into stack */
-		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
+		/* regular write of data into stack destroys any spilled ptr */
+		state->stack[spi].spilled_ptr.type = NOT_INIT;
 
 		/* only mark the slot as written if all 8 bytes were written
 		 * otherwise read propagation may incorrectly stop too soon
@@ -1106,61 +1056,6 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	return 0;
 }
 
-/* registers of every function are unique and mark_reg_read() propagates
- * the liveness in the following cases:
- * - from callee into caller for R1 - R5 that were used as arguments
- * - from caller into callee for R0 that used as result of the call
- * - from caller to the same caller skipping states of the callee for R6 - R9,
- *   since R6 - R9 are callee saved by implicit function prologue and
- *   caller's R6 != callee's R6, so when we propagate liveness up to
- *   parent states we need to skip callee states for R6 - R9.
- *
- * stack slot marking is different, since stacks of caller and callee are
- * accessible in both (since caller can pass a pointer to caller's stack to
- * callee which can pass it to another function), hence mark_stack_slot_read()
- * has to propagate the stack liveness to all parent states at given frame number.
- * Consider code:
- * f1() {
- *   ptr = fp - 8;
- *   *ptr = ctx;
- *   call f2 {
- *      .. = *ptr;
- *   }
- *   .. = *ptr;
- * }
- * First *ptr is reading from f1's stack and mark_stack_slot_read() has
- * to mark liveness at the f1's frame and not f2's frame.
- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
- * to propagate liveness to f2 states at f1's frame level and further into
- * f1 states at f1's frame level until write into that stack slot
- */
-static void mark_stack_slot_read(struct bpf_verifier_env *env,
-				 const struct bpf_verifier_state *state,
-				 struct bpf_verifier_state *parent,
-				 int slot, int frameno)
-{
-	bool writes = parent == state->parent; /* Observe write marks */
-
-	while (parent) {
-		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
-			/* since LIVE_WRITTEN mark is only done for full 8-byte
-			 * write the read marks are conservative and parent
-			 * state may not even have the stack allocated. In such case
-			 * end the propagation, since the loop reached beginning
-			 * of the function
-			 */
-			break;
-		/* if read wasn't screened by an earlier write ... */
-		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
-			break;
-		/* ... then we depend on parent's value */
-		parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
-		state = parent;
-		parent = state->parent;
-		writes = true;
-	}
-}
-
 static int check_stack_read(struct bpf_verifier_env *env,
 			    struct bpf_func_state *reg_state /* func where register points to */,
 			    int off, int size, int value_regno)
@@ -1198,8 +1093,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			 */
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
-		mark_stack_slot_read(env, vstate, vstate->parent, spi,
-				     reg_state->frameno);
+		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+			      reg_state->stack[spi].spilled_ptr.parent);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1215,8 +1110,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 				off, i, size);
 			return -EACCES;
 		}
-		mark_stack_slot_read(env, vstate, vstate->parent, spi,
-				     reg_state->frameno);
+		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+			      reg_state->stack[spi].spilled_ptr.parent);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -1908,8 +1803,8 @@ mark:
 		/* reading any byte out of 8-byte 'spill_slot' will cause
 		 * the whole slot to be marked as 'read'
 		 */
-		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
-				     spi, state->frameno);
+		mark_reg_read(env, &state->stack[spi].spilled_ptr,
+			      state->stack[spi].spilled_ptr.parent);
 	}
 	return update_stack_depth(env, state, off);
 }
@@ -2366,11 +2261,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			state->curframe + 1 /* frameno within this callchain */,
 			subprog /* subprog number within this prog */);
 
-	/* copy r1 - r5 args that callee can access */
+	/* copy r1 - r5 args that callee can access.  The copy includes parent
+	 * pointers, which connects us up to the liveness chain
+	 */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 		callee->regs[i] = caller->regs[i];
 
-	/* after the call regsiters r0 - r5 were scratched */
+	/* after the call registers r0 - r5 were scratched */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, caller->regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
@@ -4370,7 +4267,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 		/* explored state didn't use this */
 		return true;
 
-	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
 
 	if (rold->type == PTR_TO_STACK)
 		/* two stack pointers are equal only if they're pointing to
@@ -4603,7 +4500,7 @@ static bool states_equal(struct bpf_verifier_env *env,
  * equivalent state (jump target or such) we didn't arrive by the straight-line
  * code, so read marks in the state must propagate to the parent regardless
  * of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() and mark_stack_slot_read() is for.
+ * in mark_reg_read() is for.
  */
 static int propagate_liveness(struct bpf_verifier_env *env,
 			      const struct bpf_verifier_state *vstate,
@@ -4624,7 +4521,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
 			continue;
 		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
-			err = mark_reg_read(env, vstate, vparent, i);
+			err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
+					    &vparent->frame[vstate->curframe]->regs[i]);
 			if (err)
 				return err;
 		}
@@ -4639,7 +4537,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 				continue;
 			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
-				mark_stack_slot_read(env, vstate, vparent, i, frame);
+				mark_reg_read(env, &state->stack[i].spilled_ptr,
+					      &parent->stack[i].spilled_ptr);
 		}
 	}
 	return err;
@@ -4649,7 +4548,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
-	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err;
 
 	sl = env->explored_states[insn_idx];
@@ -4691,16 +4590,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	err = copy_verifier_state(&new_sl->state, cur);
+	new = &new_sl->state;
+	err = copy_verifier_state(new, cur);
 	if (err) {
-		free_verifier_state(&new_sl->state, false);
+		free_verifier_state(new, false);
 		kfree(new_sl);
 		return err;
 	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
-	cur->parent = &new_sl->state;
+	for (i = 0; i < BPF_REG_FP; i++)
+		cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
@@ -4713,9 +4614,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	/* all stack frames are accessible from callee, clear them all */
 	for (j = 0; j <= cur->curframe; j++) {
 		struct bpf_func_state *frame = cur->frame[j];
+		struct bpf_func_state *newframe = new->frame[j];
 
-		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
 			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+			frame->stack[i].spilled_ptr.parent =
+						&newframe->stack[i].spilled_ptr;
+		}
 	}
 	return 0;
 }
@@ -4734,7 +4639,6 @@ static int do_check(struct bpf_verifier_env *env)
 	if (!state)
 		return -ENOMEM;
 	state->curframe = 0;
-	state->parent = NULL;
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);
-- 
cgit v1.2.3


From 6fce10f70461c14079d5d44aa2b25c693f4d9221 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Tue, 28 Aug 2018 18:51:58 +0200
Subject: genetlink: constify genl_err_attr() argument

genl_err_attr() sets netlink_ext_ack::bad_attr which is a pointer to const
struct nlattr so make the attr argument also const.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index decf6012a401..aa2e5888f18d 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -112,7 +112,7 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net)
 #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg)
 
 static inline int genl_err_attr(struct genl_info *info, int err,
-				struct nlattr *attr)
+				const struct nlattr *attr)
 {
 	info->extack->bad_attr = attr;
 
-- 
cgit v1.2.3


From 9b3004953503462a4fab31b85e44ae446d48f0bd Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Tue, 28 Aug 2018 19:56:58 +0200
Subject: ethtool: drop get_settings and set_settings callbacks

Since [gs]et_settings ethtool_ops callbacks have been deprecated in
February 2016, all in tree NIC drivers have been converted to provide
[gs]et_link_ksettings() and out of tree drivers have had enough time to do
the same.

Drop get_settings() and set_settings() and implement both ETHTOOL_[GS]SET
and ETHTOOL_[GS]LINKSETTINGS only using [gs]et_link_ksettings().

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/ABI/testing/sysfs-class-net |   4 +-
 include/linux/ethtool.h                   |  33 ++-----
 include/uapi/linux/ethtool.h              |  15 +--
 net/core/ethtool.c                        | 158 +++++++-----------------------
 4 files changed, 50 insertions(+), 160 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 2f1788111cd9..e2e0fe553ad8 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -117,7 +117,7 @@ Description:
 		full: full duplex
 
 		Note: This attribute is only valid for interfaces that implement
-		the ethtool get_settings method (mostly Ethernet).
+		the ethtool get_link_ksettings method (mostly Ethernet).
 
 What:		/sys/class/net/<iface>/flags
 Date:		April 2005
@@ -224,7 +224,7 @@ Description:
 		an integer representing the link speed in Mbits/sec.
 
 		Note: this attribute is only valid for interfaces that implement
-		the ethtool get_settings method (mostly Ethernet ).
+		the ethtool get_link_ksettings method (mostly Ethernet).
 
 What:		/sys/class/net/<iface>/tx_queue_len
 Date:		April 2005
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index f8a2245b70ac..afd9596ce636 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -183,14 +183,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 
 /**
  * struct ethtool_ops - optional netdev operations
- * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
- *	API. Get various device settings including Ethernet link
- *	settings. The @cmd parameter is expected to have been cleared
- *	before get_settings is called. Returns a negative error code
- *	or zero.
- * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
- *	API. Set various device settings including Ethernet link
- *	settings.  Returns a negative error code or zero.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
  *	implemented, the @driver and @bus_info fields will be filled in
@@ -297,19 +289,16 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, ignore the inapplicable fields.
  *	Returns a negative error code or zero.
- * @get_link_ksettings: When defined, takes precedence over the
- *	%get_settings method. Get various device settings
- *	including Ethernet link settings. The %cmd and
- *	%link_mode_masks_nwords fields should be ignored (use
- *	%__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any
- *	change to them will be overwritten by kernel. Returns a
- *	negative error code or zero.
- * @set_link_ksettings: When defined, takes precedence over the
- *	%set_settings method. Set various device settings including
- *	Ethernet link settings. The %cmd and %link_mode_masks_nwords
- *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
- *	instead of the latter), any change to them will be overwritten
- *	by kernel. Returns a negative error code or zero.
+ * @get_link_ksettings: Get various device settings including Ethernet link
+ *	settings. The %cmd and %link_mode_masks_nwords fields should be
+ *	ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter),
+ *	any change to them will be overwritten by kernel. Returns a negative
+ *	error code or zero.
+ * @set_link_ksettings: Set various device settings including Ethernet link
+ *	settings. The %cmd and %link_mode_masks_nwords fields should be
+ *	ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter),
+ *	any change to them will be overwritten by kernel. Returns a negative
+ *	error code or zero.
  * @get_fecparam: Get the network device Forward Error Correction parameters.
  * @set_fecparam: Set the network device Forward Error Correction parameters.
  * @get_ethtool_phy_stats: Return extended statistics about the PHY device.
@@ -329,8 +318,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
  * of the generic netdev features interface.
  */
 struct ethtool_ops {
-	int	(*get_settings)(struct net_device *, struct ethtool_cmd *);
-	int	(*set_settings)(struct net_device *, struct ethtool_cmd *);
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
 	int	(*get_regs_len)(struct net_device *);
 	void	(*get_regs)(struct net_device *, struct ethtool_regs *, void *);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index dc69391d2bba..c8f8e2455bf3 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -91,10 +91,6 @@
  * %ETHTOOL_GSET to get the current values before making specific
  * changes and then applying them with %ETHTOOL_SSET.
  *
- * Drivers that implement set_settings() should validate all fields
- * other than @cmd that are not described as read-only or deprecated,
- * and must ignore all fields described as read-only.
- *
  * Deprecated fields should be ignored by both users and drivers.
  */
 struct ethtool_cmd {
@@ -1800,14 +1796,9 @@ enum ethtool_reset_flags {
  * rejected.
  *
  * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
- * are not available in %ethtool_link_settings. Until all drivers are
- * converted to ignore them or to the new %ethtool_link_settings API,
- * for both queries and changes, users should always try
- * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick
- * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it
- * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and
- * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing
- * either %ethtool_cmd or %ethtool_link_settings).
+ * are not available in %ethtool_link_settings. These fields will be
+ * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will
+ * fail if any of them is set to non-zero value.
  *
  * Users should assume that all fields not marked read-only are
  * writable and subject to validation by the driver.  They should use
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c9993c6c2fd4..9d4e56d97080 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -539,47 +539,17 @@ struct ethtool_link_usettings {
 	} link_modes;
 };
 
-/* Internal kernel helper to query a device ethtool_link_settings.
- *
- * Backward compatibility note: for compatibility with legacy drivers
- * that implement only the ethtool_cmd API, this has to work with both
- * drivers implementing get_link_ksettings API and drivers
- * implementing get_settings API. When drivers implement get_settings
- * and report ethtool_cmd deprecated fields
- * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
- * because the resulting struct ethtool_link_settings does not report them.
- */
+/* Internal kernel helper to query a device ethtool_link_settings. */
 int __ethtool_get_link_ksettings(struct net_device *dev,
 				 struct ethtool_link_ksettings *link_ksettings)
 {
-	int err;
-	struct ethtool_cmd cmd;
-
 	ASSERT_RTNL();
 
-	if (dev->ethtool_ops->get_link_ksettings) {
-		memset(link_ksettings, 0, sizeof(*link_ksettings));
-		return dev->ethtool_ops->get_link_ksettings(dev,
-							    link_ksettings);
-	}
-
-	/* driver doesn't support %ethtool_link_ksettings API. revert to
-	 * legacy %ethtool_cmd API, unless it's not supported either.
-	 * TODO: remove when ethtool_ops::get_settings disappears internally
-	 */
-	if (!dev->ethtool_ops->get_settings)
+	if (!dev->ethtool_ops->get_link_ksettings)
 		return -EOPNOTSUPP;
 
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.cmd = ETHTOOL_GSET;
-	err = dev->ethtool_ops->get_settings(dev, &cmd);
-	if (err < 0)
-		return err;
-
-	/* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
-	 */
-	convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
-	return err;
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+	return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
 }
 EXPORT_SYMBOL(__ethtool_get_link_ksettings);
 
@@ -635,16 +605,7 @@ store_link_ksettings_for_user(void __user *to,
 	return 0;
 }
 
-/* Query device for its ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::get_link_ksettings, even if legacy
- * ethtool_ops::get_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_GSET for
- * this driver, so that they can correctly access the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Query device for its ethtool_link_settings. */
 static int ethtool_get_link_ksettings(struct net_device *dev,
 				      void __user *useraddr)
 {
@@ -652,7 +613,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
 	struct ethtool_link_ksettings link_ksettings;
 
 	ASSERT_RTNL();
-
 	if (!dev->ethtool_ops->get_link_ksettings)
 		return -EOPNOTSUPP;
 
@@ -699,16 +659,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
 	return store_link_ksettings_for_user(useraddr, &link_ksettings);
 }
 
-/* Update device ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::set_link_ksettings, even if legacy
- * ethtool_ops::set_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_SSET for
- * this driver, so that they can correctly update the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Update device ethtool_link_settings. */
 static int ethtool_set_link_ksettings(struct net_device *dev,
 				      void __user *useraddr)
 {
@@ -746,51 +697,31 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
 
 /* Query device for its ethtool_cmd settings.
  *
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing get_link_ksettings
- * API and drivers implementing get_settings API. When drivers
- * implement get_link_ksettings and report higher link mode bits, a
- * kernel warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, but the command is successful
- * (only the lower link mode bits reported back to user).
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now implemented via get_link_ksettings. When driver reports higher link mode
+ * bits, a kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful (only the
+ * lower link mode bits reported back to user). Deprecated fields from
+ * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero.
  */
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 {
+	struct ethtool_link_ksettings link_ksettings;
 	struct ethtool_cmd cmd;
+	int err;
 
 	ASSERT_RTNL();
+	if (!dev->ethtool_ops->get_link_ksettings)
+		return -EOPNOTSUPP;
 
-	if (dev->ethtool_ops->get_link_ksettings) {
-		/* First, use link_ksettings API if it is supported */
-		int err;
-		struct ethtool_link_ksettings link_ksettings;
-
-		memset(&link_ksettings, 0, sizeof(link_ksettings));
-		err = dev->ethtool_ops->get_link_ksettings(dev,
-							   &link_ksettings);
-		if (err < 0)
-			return err;
-		convert_link_ksettings_to_legacy_settings(&cmd,
-							  &link_ksettings);
-
-		/* send a sensible cmd tag back to user */
-		cmd.cmd = ETHTOOL_GSET;
-	} else {
-		/* driver doesn't support %ethtool_link_ksettings
-		 * API. revert to legacy %ethtool_cmd API, unless it's
-		 * not supported either.
-		 */
-		int err;
-
-		if (!dev->ethtool_ops->get_settings)
-			return -EOPNOTSUPP;
+	memset(&link_ksettings, 0, sizeof(link_ksettings));
+	err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+	if (err < 0)
+		return err;
+	convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings);
 
-		memset(&cmd, 0, sizeof(cmd));
-		cmd.cmd = ETHTOOL_GSET;
-		err = dev->ethtool_ops->get_settings(dev, &cmd);
-		if (err < 0)
-			return err;
-	}
+	/* send a sensible cmd tag back to user */
+	cmd.cmd = ETHTOOL_GSET;
 
 	if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
 		return -EFAULT;
@@ -800,48 +731,29 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 
 /* Update device link settings with given ethtool_cmd.
  *
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing set_link_ksettings
- * API and drivers implementing set_settings API. When drivers
- * implement set_link_ksettings and user's request updates deprecated
- * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
- * warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, and the request is rejected.
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now always implemented via set_link_settings. When user's request updates
+ * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to recommend user to
+ * upgrade ethtool, and the request is rejected.
  */
 static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 {
+	struct ethtool_link_ksettings link_ksettings;
 	struct ethtool_cmd cmd;
 
 	ASSERT_RTNL();
 
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
-
-	/* first, try new %ethtool_link_ksettings API. */
-	if (dev->ethtool_ops->set_link_ksettings) {
-		struct ethtool_link_ksettings link_ksettings;
-
-		if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
-							       &cmd))
-			return -EINVAL;
-
-		link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
-		link_ksettings.base.link_mode_masks_nwords
-			= __ETHTOOL_LINK_MODE_MASK_NU32;
-		return dev->ethtool_ops->set_link_ksettings(dev,
-							    &link_ksettings);
-	}
-
-	/* legacy %ethtool_cmd API */
-
-	/* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
-	 * disappears internally
-	 */
-
-	if (!dev->ethtool_ops->set_settings)
+	if (!dev->ethtool_ops->set_link_ksettings)
 		return -EOPNOTSUPP;
 
-	return dev->ethtool_ops->set_settings(dev, &cmd);
+	if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd))
+		return -EINVAL;
+	link_ksettings.base.link_mode_masks_nwords =
+		__ETHTOOL_LINK_MODE_MASK_NU32;
+	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 }
 
 static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
-- 
cgit v1.2.3


From a4e0109a19c554e44c832958e426303781e1ad30 Mon Sep 17 00:00:00 2001
From: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Date: Mon, 20 Aug 2018 08:12:32 -0700
Subject: virtchnl: use u8 type for a field in the virtchnl_filter struct

The virtchnl_filter struct has a field called field_flags. A previous
commit mistakenly had the type to be a __u8. What we want is for the
field to be an unsigned 8 bit value, so let's just use the existing
kernel type u8 for that.

Signed-off-by: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/avf/virtchnl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 212b3822d180..b41f7bc958ef 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -573,7 +573,7 @@ struct virtchnl_filter {
 	enum	virtchnl_flow_type flow_type;
 	enum	virtchnl_action action;
 	u32	action_meta;
-	__u8	field_flags;
+	u8	field_flags;
 };
 
 VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter);
-- 
cgit v1.2.3


From 93ee30f3e8b412c5fc2d2f7d9d002529d9a209ad Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Fri, 31 Aug 2018 13:40:02 +0200
Subject: xsk: i40e: get rid of useless struct xdp_umem_props

This commit gets rid of the structure xdp_umem_props. It was there to
be able to break a dependency at one point, but this is no longer
needed. The values in the struct are instead stored directly in the
xdp_umem structure. This simplifies the xsk code as well as af_xdp
zero-copy drivers and as a bonus gets rid of one internal header file.

The i40e driver is also adapted to the new interface in this commit.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c |  4 ++--
 include/net/xdp_sock.h                     |  8 ++------
 net/xdp/xdp_umem.c                         |  4 ++--
 net/xdp/xdp_umem_props.h                   | 14 --------------
 net/xdp/xsk.c                              | 10 ++++++----
 net/xdp/xsk_queue.c                        |  5 +++--
 net/xdp/xsk_queue.h                        | 13 +++++++------
 7 files changed, 22 insertions(+), 36 deletions(-)
 delete mode 100644 net/xdp/xdp_umem_props.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 41ca7e1310bc..2ebfc78bbd09 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -442,7 +442,7 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
 				    struct i40e_rx_buffer *old_bi)
 {
 	struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
-	unsigned long mask = (unsigned long)rx_ring->xsk_umem->props.chunk_mask;
+	unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
 	u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
 	u16 nta = rx_ring->next_to_alloc;
 
@@ -477,7 +477,7 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
 
 	rx_ring = container_of(alloc, struct i40e_ring, zca);
 	hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
-	mask = rx_ring->xsk_umem->props.chunk_mask;
+	mask = rx_ring->xsk_umem->chunk_mask;
 
 	nta = rx_ring->next_to_alloc;
 	bi = &rx_ring->rx_bi[nta];
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 56994ad1ab40..932ca0dad6f3 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -16,11 +16,6 @@
 struct net_device;
 struct xsk_queue;
 
-struct xdp_umem_props {
-	u64 chunk_mask;
-	u64 size;
-};
-
 struct xdp_umem_page {
 	void *addr;
 	dma_addr_t dma;
@@ -30,7 +25,8 @@ struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
 	struct xdp_umem_page *pages;
-	struct xdp_umem_props props;
+	u64 chunk_mask;
+	u64 size;
 	u32 headroom;
 	u32 chunk_size_nohr;
 	struct user_struct *user;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index d179732617dc..b3b632c5aeae 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -312,8 +312,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 
 	umem->pid = get_task_pid(current, PIDTYPE_PID);
 	umem->address = (unsigned long)addr;
-	umem->props.chunk_mask = ~((u64)chunk_size - 1);
-	umem->props.size = size;
+	umem->chunk_mask = ~((u64)chunk_size - 1);
+	umem->size = size;
 	umem->headroom = headroom;
 	umem->chunk_size_nohr = chunk_size - headroom;
 	umem->npgs = size / PAGE_SIZE;
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
deleted file mode 100644
index 40eab10dfc49..000000000000
--- a/net/xdp/xdp_umem_props.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* XDP user-space packet buffer
- * Copyright(c) 2018 Intel Corporation.
- */
-
-#ifndef XDP_UMEM_PROPS_H_
-#define XDP_UMEM_PROPS_H_
-
-struct xdp_umem_props {
-	u64 chunk_mask;
-	u64 size;
-};
-
-#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 569048e299df..5a432dfee4ee 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -470,8 +470,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_unlock;
 	} else {
 		/* This xsk has its own umem. */
-		xskq_set_umem(xs->umem->fq, &xs->umem->props);
-		xskq_set_umem(xs->umem->cq, &xs->umem->props);
+		xskq_set_umem(xs->umem->fq, xs->umem->size,
+			      xs->umem->chunk_mask);
+		xskq_set_umem(xs->umem->cq, xs->umem->size,
+			      xs->umem->chunk_mask);
 
 		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
 		if (err)
@@ -481,8 +483,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	xs->dev = dev;
 	xs->zc = xs->umem->zc;
 	xs->queue_id = qid;
-	xskq_set_umem(xs->rx, &xs->umem->props);
-	xskq_set_umem(xs->tx, &xs->umem->props);
+	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
+	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
 	xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 6c32e92e98fc..2dc1384d9f27 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -7,12 +7,13 @@
 
 #include "xsk_queue.h"
 
-void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
+void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask)
 {
 	if (!q)
 		return;
 
-	q->umem_props = *umem_props;
+	q->size = size;
+	q->chunk_mask = chunk_mask;
 }
 
 static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 8a64b150be54..82252cccb4e0 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -31,7 +31,8 @@ struct xdp_umem_ring {
 };
 
 struct xsk_queue {
-	struct xdp_umem_props umem_props;
+	u64 chunk_mask;
+	u64 size;
 	u32 ring_mask;
 	u32 nentries;
 	u32 prod_head;
@@ -78,7 +79,7 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 
 static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
-	if (addr >= q->umem_props.size) {
+	if (addr >= q->size) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -92,7 +93,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		*addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
+		*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
 		if (xskq_is_valid_addr(q, *addr))
 			return addr;
 
@@ -173,8 +174,8 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
 	if (!xskq_is_valid_addr(q, d->addr))
 		return false;
 
-	if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
-	    (d->addr & q->umem_props.chunk_mask)) {
+	if (((d->addr + d->len) & q->chunk_mask) !=
+	    (d->addr & q->chunk_mask)) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -253,7 +254,7 @@ static inline bool xskq_empty_desc(struct xsk_queue *q)
 	return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
 }
 
-void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
+void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q_ops);
 
-- 
cgit v1.2.3


From f061b48c1787e6fece2190e27da6878f4f1796d0 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 29 Aug 2018 10:15:35 -0700
Subject: Revert "net: sched: act: add extack for lookup callback"

This reverts commit 331a9295de23 ("net: sched: act: add extack for lookup callback").

This extack is never used after 6 months... In fact, it can be just
set in the caller, right after ->lookup().

Cc: Alexander Aring <aring@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h      | 3 +--
 net/sched/act_api.c        | 6 ++++--
 net/sched/act_bpf.c        | 3 +--
 net/sched/act_connmark.c   | 3 +--
 net/sched/act_csum.c       | 3 +--
 net/sched/act_gact.c       | 3 +--
 net/sched/act_ife.c        | 3 +--
 net/sched/act_ipt.c        | 6 ++----
 net/sched/act_mirred.c     | 3 +--
 net/sched/act_nat.c        | 3 +--
 net/sched/act_pedit.c      | 3 +--
 net/sched/act_police.c     | 3 +--
 net/sched/act_sample.c     | 3 +--
 net/sched/act_simple.c     | 3 +--
 net/sched/act_skbedit.c    | 3 +--
 net/sched/act_skbmod.c     | 3 +--
 net/sched/act_tunnel_key.c | 3 +--
 net/sched/act_vlan.c       | 3 +--
 18 files changed, 22 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 970303448c90..c6f195b3c706 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -85,8 +85,7 @@ struct tc_action_ops {
 		       struct tcf_result *); /* called under RCU BH lock*/
 	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
 	void	(*cleanup)(struct tc_action *);
-	int     (*lookup)(struct net *net, struct tc_action **a, u32 index,
-			  struct netlink_ext_ack *extack);
+	int     (*lookup)(struct net *net, struct tc_action **a, u32 index);
 	int     (*init)(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act, int ovr,
 			int bind, bool rtnl_held,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index db83dac1e7f4..398c752ff529 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1067,12 +1067,14 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
 	err = -EINVAL;
 	ops = tc_lookup_action(tb[TCA_ACT_KIND]);
 	if (!ops) { /* could happen in batch of actions */
-		NL_SET_ERR_MSG(extack, "Specified TC action not found");
+		NL_SET_ERR_MSG(extack, "Specified TC action kind not found");
 		goto err_out;
 	}
 	err = -ENOENT;
-	if (ops->lookup(net, &a, index, extack) == 0)
+	if (ops->lookup(net, &a, index) == 0) {
+		NL_SET_ERR_MSG(extack, "TC action with specified index not found");
 		goto err_mod;
+	}
 
 	module_put(ops->owner);
 	return a;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 0c68bc9cf0b4..c7633843e223 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -387,8 +387,7 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
-			  struct netlink_ext_ack *extack)
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 6f0f273f1139..e869c0ee63c8 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -190,8 +190,7 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
-			       struct netlink_ext_ack *extack)
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b8a67ae3105a..3dc25b7806d7 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -646,8 +646,7 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
-			   struct netlink_ext_ack *extack)
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index cd1d9bd32ef9..aa44d14b43c7 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -222,8 +222,7 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
-			   struct netlink_ext_ack *extack)
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 196430aefe87..19454146f60d 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -841,8 +841,7 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
-			  struct netlink_ext_ack *extack)
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 23273b5303fd..1efbfb10b1fc 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -329,8 +329,7 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
-			  struct netlink_ext_ack *extack)
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
@@ -379,8 +378,7 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
-			 struct netlink_ext_ack *extack)
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8bf66d0a6800..a9d64bfe5a2a 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -338,8 +338,7 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index,
-			     struct netlink_ext_ack *extack)
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4313aa102440..d98f33fdffe2 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -292,8 +292,7 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
-			  struct netlink_ext_ack *extack)
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 107034070019..6d6a9450e8ad 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -452,8 +452,7 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
-			    struct netlink_ext_ack *extack)
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5d8bfa878477..393c7a670300 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -312,8 +312,7 @@ nla_put_failure:
 	return -1;
 }
 
-static int tcf_police_search(struct net *net, struct tc_action **a, u32 index,
-			     struct netlink_ext_ack *extack)
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 44e9c00657bc..83a133375d6d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -224,8 +224,7 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
-			     struct netlink_ext_ack *extack)
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 52400d49f81f..902957beceb3 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -188,8 +188,7 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
-			   struct netlink_ext_ack *extack)
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 73e44ce2a883..b6263704ea57 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -291,8 +291,7 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
-			      struct netlink_ext_ack *extack)
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 588077fafd6c..59710a183bd3 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -251,8 +251,7 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
-			     struct netlink_ext_ack *extack)
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 420759153d5f..6d95b6919d9d 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -540,8 +540,7 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
-			     struct netlink_ext_ack *extack)
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 033d273afe50..ba677d54a7af 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -288,8 +288,7 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
-static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
-			   struct netlink_ext_ack *extack)
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 
-- 
cgit v1.2.3


From 94524d8fc965a7a0facdef6d1b01d5ef6d71a802 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Wed, 29 Aug 2018 15:26:55 +0530
Subject: net/tls: Add support for async decryption of tls records

When tls records are decrypted using asynchronous acclerators such as
NXP CAAM engine, the crypto apis return -EINPROGRESS. Presently, on
getting -EINPROGRESS, the tls record processing stops till the time the
crypto accelerator finishes off and returns the result. This incurs a
context switch and is not an efficient way of accessing the crypto
accelerators. Crypto accelerators work efficient when they are queued
with multiple crypto jobs without having to wait for the previous ones
to complete.

The patch submits multiple crypto requests without having to wait for
for previous ones to complete. This has been implemented for records
which are decrypted in zero-copy mode. At the end of recvmsg(), we wait
for all the asynchronous decryption requests to complete.

The references to records which have been sent for async decryption are
dropped. For cases where record decryption is not possible in zero-copy
mode, asynchronous decryption is not used and we wait for decryption
crypto api to complete.

For crypto requests executing in async fashion, the memory for
aead_request, sglists and skb etc is freed from the decryption
completion handler. The decryption completion handler wakesup the
sleeping user context when recvmsg() flags that it has done sending
all the decryption requests and there are no more decryption requests
pending to be completed.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Reviewed-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h |   6 +++
 net/tls/tls_sw.c  | 134 ++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 127 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index d5c683e8bb22..cd0a65bd92f9 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -124,6 +124,12 @@ struct tls_sw_context_rx {
 	struct sk_buff *recv_pkt;
 	u8 control;
 	bool decrypted;
+	atomic_t decrypt_pending;
+	bool async_notify;
+};
+
+struct decrypt_req_ctx {
+	struct sock *sk;
 };
 
 struct tls_record_info {
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4ba62cd00a94..be4f2e990f9f 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -119,12 +119,50 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
         return __skb_nsg(skb, offset, len, 0);
 }
 
+static void tls_decrypt_done(struct crypto_async_request *req, int err)
+{
+	struct aead_request *aead_req = (struct aead_request *)req;
+	struct decrypt_req_ctx *req_ctx =
+			(struct decrypt_req_ctx *)(aead_req + 1);
+
+	struct scatterlist *sgout = aead_req->dst;
+
+	struct tls_context *tls_ctx = tls_get_ctx(req_ctx->sk);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	int pending = atomic_dec_return(&ctx->decrypt_pending);
+	struct scatterlist *sg;
+	unsigned int pages;
+
+	/* Propagate if there was an err */
+	if (err) {
+		ctx->async_wait.err = err;
+		tls_err_abort(req_ctx->sk, err);
+	}
+
+	/* Release the skb, pages and memory allocated for crypto req */
+	kfree_skb(req->data);
+
+	/* Skip the first S/G entry as it points to AAD */
+	for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) {
+		if (!sg)
+			break;
+		put_page(sg_page(sg));
+	}
+
+	kfree(aead_req);
+
+	if (!pending && READ_ONCE(ctx->async_notify))
+		complete(&ctx->async_wait.completion);
+}
+
 static int tls_do_decryption(struct sock *sk,
+			     struct sk_buff *skb,
 			     struct scatterlist *sgin,
 			     struct scatterlist *sgout,
 			     char *iv_recv,
 			     size_t data_len,
-			     struct aead_request *aead_req)
+			     struct aead_request *aead_req,
+			     bool async)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -135,10 +173,34 @@ static int tls_do_decryption(struct sock *sk,
 	aead_request_set_crypt(aead_req, sgin, sgout,
 			       data_len + tls_ctx->rx.tag_size,
 			       (u8 *)iv_recv);
-	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-				  crypto_req_done, &ctx->async_wait);
 
-	ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
+	if (async) {
+		struct decrypt_req_ctx *req_ctx;
+
+		req_ctx = (struct decrypt_req_ctx *)(aead_req + 1);
+		req_ctx->sk = sk;
+
+		aead_request_set_callback(aead_req,
+					  CRYPTO_TFM_REQ_MAY_BACKLOG,
+					  tls_decrypt_done, skb);
+		atomic_inc(&ctx->decrypt_pending);
+	} else {
+		aead_request_set_callback(aead_req,
+					  CRYPTO_TFM_REQ_MAY_BACKLOG,
+					  crypto_req_done, &ctx->async_wait);
+	}
+
+	ret = crypto_aead_decrypt(aead_req);
+	if (ret == -EINPROGRESS) {
+		if (async)
+			return ret;
+
+		ret = crypto_wait_req(ret, &ctx->async_wait);
+	}
+
+	if (async)
+		atomic_dec(&ctx->decrypt_pending);
+
 	return ret;
 }
 
@@ -841,7 +903,10 @@ fallback_to_reg_recv:
 	}
 
 	/* Prepare and submit AEAD request */
-	err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req);
+	err = tls_do_decryption(sk, skb, sgin, sgout, iv,
+				data_len, aead_req, *zc);
+	if (err == -EINPROGRESS)
+		return err;
 
 	/* Release the pages in case iov was mapped to pages */
 	for (; pages > 0; pages--)
@@ -866,8 +931,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 #endif
 	if (!ctx->decrypted) {
 		err = decrypt_internal(sk, skb, dest, NULL, chunk, zc);
-		if (err < 0)
+		if (err < 0) {
+			if (err == -EINPROGRESS)
+				tls_advance_record_sn(sk, &tls_ctx->rx);
+
 			return err;
+		}
 	} else {
 		*zc = false;
 	}
@@ -895,18 +964,20 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-	struct strp_msg *rxm = strp_msg(skb);
 
-	if (len < rxm->full_len) {
-		rxm->offset += len;
-		rxm->full_len -= len;
+	if (skb) {
+		struct strp_msg *rxm = strp_msg(skb);
 
-		return false;
+		if (len < rxm->full_len) {
+			rxm->offset += len;
+			rxm->full_len -= len;
+			return false;
+		}
+		kfree_skb(skb);
 	}
 
 	/* Finished with message */
 	ctx->recv_pkt = NULL;
-	kfree_skb(skb);
 	__strp_unpause(&ctx->strp);
 
 	return true;
@@ -929,6 +1000,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	int target, err = 0;
 	long timeo;
 	bool is_kvec = msg->msg_iter.type & ITER_KVEC;
+	int num_async = 0;
 
 	flags |= nonblock;
 
@@ -941,6 +1013,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 	do {
 		bool zc = false;
+		bool async = false;
 		int chunk = 0;
 
 		skb = tls_wait_data(sk, flags, timeo, &err);
@@ -948,6 +1021,7 @@ int tls_sw_recvmsg(struct sock *sk,
 			goto recv_end;
 
 		rxm = strp_msg(skb);
+
 		if (!cmsg) {
 			int cerr;
 
@@ -974,26 +1048,39 @@ int tls_sw_recvmsg(struct sock *sk,
 
 			err = decrypt_skb_update(sk, skb, &msg->msg_iter,
 						 &chunk, &zc);
-			if (err < 0) {
+			if (err < 0 && err != -EINPROGRESS) {
 				tls_err_abort(sk, EBADMSG);
 				goto recv_end;
 			}
+
+			if (err == -EINPROGRESS) {
+				async = true;
+				num_async++;
+				goto pick_next_record;
+			}
+
 			ctx->decrypted = true;
 		}
 
 		if (!zc) {
 			chunk = min_t(unsigned int, rxm->full_len, len);
+
 			err = skb_copy_datagram_msg(skb, rxm->offset, msg,
 						    chunk);
 			if (err < 0)
 				goto recv_end;
 		}
 
+pick_next_record:
 		copied += chunk;
 		len -= chunk;
 		if (likely(!(flags & MSG_PEEK))) {
 			u8 control = ctx->control;
 
+			/* For async, drop current skb reference */
+			if (async)
+				skb = NULL;
+
 			if (tls_sw_advance_skb(sk, skb, chunk)) {
 				/* Return full control message to
 				 * userspace before trying to parse
@@ -1002,14 +1089,33 @@ int tls_sw_recvmsg(struct sock *sk,
 				msg->msg_flags |= MSG_EOR;
 				if (control != TLS_RECORD_TYPE_DATA)
 					goto recv_end;
+			} else {
+				break;
 			}
 		}
+
 		/* If we have a new message from strparser, continue now. */
 		if (copied >= target && !ctx->recv_pkt)
 			break;
 	} while (len);
 
 recv_end:
+	if (num_async) {
+		/* Wait for all previously submitted records to be decrypted */
+		smp_store_mb(ctx->async_notify, true);
+		if (atomic_read(&ctx->decrypt_pending)) {
+			err = crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
+			if (err) {
+				/* one of async decrypt failed */
+				tls_err_abort(sk, err);
+				copied = 0;
+			}
+		} else {
+			reinit_completion(&ctx->async_wait.completion);
+		}
+		WRITE_ONCE(ctx->async_notify, false);
+	}
+
 	release_sock(sk);
 	return copied ? : err;
 }
@@ -1349,6 +1455,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_aead;
 
 	if (sw_ctx_rx) {
+		(*aead)->reqsize = sizeof(struct decrypt_req_ctx);
+
 		/* Set up strparser */
 		memset(&cb, 0, sizeof(cb));
 		cb.rcv_msg = tls_queue;
-- 
cgit v1.2.3


From b9de3963cc2b373a655636335cb8c4ed12fc9d3b Mon Sep 17 00:00:00 2001
From: Florent Fourcot <florent.fourcot@wifirst.fr>
Date: Thu, 30 Aug 2018 16:39:23 +0200
Subject: net/sched: fix type of htb statistics

tokens and ctokens are defined as s64 in htb_class structure,
and clamped to 32bits value during netlink dumps:

cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
                            INT_MIN, INT_MAX);

Defining it as u32 is working since userspace (tc) is printing it as
signed int, but a correct definition from the beginning is probably
better.

In the same time, 'giants' structure member is unused since years, so
update the comment to mark it unused.

Signed-off-by: Florent Fourcot <florent.fourcot@wifirst.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 8975fd1a1421..e9b7244ac381 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -395,9 +395,9 @@ enum {
 struct tc_htb_xstats {
 	__u32 lends;
 	__u32 borrows;
-	__u32 giants;	/* too big packets (rate will not be accurate) */
-	__u32 tokens;
-	__u32 ctokens;
+	__u32 giants;	/* unused since 'Make HTB scheduler work with TSO.' */
+	__s32 tokens;
+	__s32 ctokens;
 };
 
 /* HFSC section */
-- 
cgit v1.2.3


From aa7e80b220f3a543eefbe4b7e2c5d2b73e2e2ef7 Mon Sep 17 00:00:00 2001
From: Moni Shoua <monis@mellanox.com>
Date: Mon, 3 Sep 2018 20:19:28 +0300
Subject: net/mlx5: Fix atomic_mode enum values

The field atomic_mode is 4 bits wide and therefore can hold values
from 0x0 to 0xf. Remove the unnecessary 20 bit shift that made the values
be incorrect. While that, remove unused enum values.

Fixes: 57cda166bbe0 ("net/mlx5: Add DCT command interface")
Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/driver.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7a452716de4b..d885e9f0e054 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -163,10 +163,7 @@ enum mlx5_dcbx_oper_mode {
 };
 
 enum mlx5_dct_atomic_mode {
-	MLX5_ATOMIC_MODE_DCT_OFF        = 20,
-	MLX5_ATOMIC_MODE_DCT_NONE       = 0 << MLX5_ATOMIC_MODE_DCT_OFF,
-	MLX5_ATOMIC_MODE_DCT_IB_COMP    = 1 << MLX5_ATOMIC_MODE_DCT_OFF,
-	MLX5_ATOMIC_MODE_DCT_CX         = 2 << MLX5_ATOMIC_MODE_DCT_OFF,
+	MLX5_ATOMIC_MODE_DCT_CX         = 2,
 };
 
 enum {
-- 
cgit v1.2.3


From 8ce78257965e6cd49720e653867e766ecd38883f Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:41 +0300
Subject: net/mlx5: Add proper NIC TX steering flow tables support

Extend the ability to add steering rules to NIC TX flow tables.
For now, we are only adding TX bypass (egress) which is used by the RDMA
side. This will allow to shape outgoing traffic and tweak it if needed, for
example performing encapsulation or rewriting headers.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 51 +++++++++++++++++------
 include/linux/mlx5/device.h                       |  6 +++
 3 files changed, 46 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 8e01f818021b..28c7301e08f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -760,8 +760,8 @@ const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type typ
 	case FS_FT_FDB:
 	case FS_FT_SNIFFER_RX:
 	case FS_FT_SNIFFER_TX:
-		return mlx5_fs_cmd_get_fw_cmds();
 	case FS_FT_NIC_TX:
+		return mlx5_fs_cmd_get_fw_cmds();
 	default:
 		return mlx5_fs_cmd_get_stub_cmds();
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5624030d2ed4..b7e7eb3535c7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -76,6 +76,14 @@
 					   FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), \
 					   FS_CAP(flow_table_properties_nic_receive.flow_table_modify))
 
+#define FS_CHAINING_CAPS_EGRESS                                                \
+	FS_REQUIRED_CAPS(                                                      \
+		FS_CAP(flow_table_properties_nic_transmit.flow_modify_en),     \
+		FS_CAP(flow_table_properties_nic_transmit.modify_root),        \
+		FS_CAP(flow_table_properties_nic_transmit                      \
+			       .identified_miss_table_mode),                   \
+		FS_CAP(flow_table_properties_nic_transmit.flow_table_modify))
+
 #define LEFTOVERS_NUM_LEVELS 1
 #define LEFTOVERS_NUM_PRIOS 1
 
@@ -151,6 +159,17 @@ static struct init_tree_node {
 	}
 };
 
+static struct init_tree_node egress_root_fs = {
+	.type = FS_TYPE_NAMESPACE,
+	.ar_size = 1,
+	.children = (struct init_tree_node[]) {
+		ADD_PRIO(0, MLX5_BY_PASS_NUM_PRIOS, 0,
+			 FS_CHAINING_CAPS_EGRESS,
+			 ADD_NS(ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS,
+						  BY_PASS_PRIO_NUM_LEVELS))),
+	}
+};
+
 enum fs_i_lock_class {
 	FS_LOCK_GRANDPARENT,
 	FS_LOCK_PARENT,
@@ -1978,7 +1997,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
 	struct mlx5_flow_root_namespace *root_ns;
-	int prio;
+	int prio = 0;
 	struct fs_prio *fs_prio;
 	struct mlx5_flow_namespace *ns;
 
@@ -1998,16 +2017,17 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 		if (steering->sniffer_tx_root_ns)
 			return &steering->sniffer_tx_root_ns->ns;
 		return NULL;
-	case MLX5_FLOW_NAMESPACE_EGRESS:
-		if (steering->egress_root_ns)
-			return &steering->egress_root_ns->ns;
-		return NULL;
 	default:
 		break;
 	}
 
-	root_ns = steering->root_ns;
-	prio = type;
+	if (type == MLX5_FLOW_NAMESPACE_EGRESS) {
+		root_ns = steering->egress_root_ns;
+	} else { /* Must be NIC RX */
+		root_ns = steering->root_ns;
+		prio = type;
+	}
+
 	if (!root_ns)
 		return NULL;
 
@@ -2523,16 +2543,23 @@ cleanup_root_ns:
 
 static int init_egress_root_ns(struct mlx5_flow_steering *steering)
 {
-	struct fs_prio *prio;
+	int err;
 
 	steering->egress_root_ns = create_root_ns(steering,
 						  FS_FT_NIC_TX);
 	if (!steering->egress_root_ns)
 		return -ENOMEM;
 
-	/* create 1 prio*/
-	prio = fs_create_prio(&steering->egress_root_ns->ns, 0, 1);
-	return PTR_ERR_OR_ZERO(prio);
+	err = init_root_tree(steering, &egress_root_fs,
+			     &steering->egress_root_ns->ns.node);
+	if (err)
+		goto cleanup;
+	set_prio_attrs(steering->egress_root_ns);
+	return 0;
+cleanup:
+	cleanup_root_ns(steering->egress_root_ns);
+	steering->egress_root_ns = NULL;
+	return err;
 }
 
 int mlx5_init_fs(struct mlx5_core_dev *dev)
@@ -2600,7 +2627,7 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 			goto err;
 	}
 
-	if (MLX5_IPSEC_DEV(dev)) {
+	if (MLX5_IPSEC_DEV(dev) || MLX5_CAP_FLOWTABLE_NIC_TX(dev, ft_support)) {
 		err = init_egress_root_ns(steering);
 		if (err)
 			goto err;
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 11fa4e66afc5..f2281e69ab39 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1120,6 +1120,12 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_FLOWTABLE_NIC_RX_MAX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive.cap)
 
+#define MLX5_CAP_FLOWTABLE_NIC_TX(mdev, cap) \
+		MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_transmit.cap)
+
+#define MLX5_CAP_FLOWTABLE_NIC_TX_MAX(mdev, cap) \
+	MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit.cap)
+
 #define MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive_sniffer.cap)
 
-- 
cgit v1.2.3


From 90c1d1b8da67330b09893d749401a45328b51704 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:42 +0300
Subject: net/mlx5: Export modify header alloc/dealloc functions

Those functions will be used by the RDMA side to create modify header
actions to be attached to flow steering rules via verbs.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c    | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 5 -----
 include/linux/mlx5/fs.h                             | 6 ++++++
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 28c7301e08f4..37bea30b68ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -702,6 +702,7 @@ int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
 	kfree(in);
 	return err;
 }
+EXPORT_SYMBOL(mlx5_modify_header_alloc);
 
 void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id)
 {
@@ -716,6 +717,7 @@ void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id)
 
 	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_modify_header_dealloc);
 
 static const struct mlx5_flow_cmds mlx5_flow_cmds = {
 	.create_flow_table = mlx5_cmd_create_flow_table,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index b4134fa0bba3..649d1bd83a1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -176,11 +176,6 @@ int mlx5_encap_alloc(struct mlx5_core_dev *dev,
 		     u32 *encap_id);
 void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id);
 
-int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
-			     u8 namespace, u8 num_actions,
-			     void *modify_actions, u32 *modify_header_id);
-void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id);
-
 bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv);
 
 int mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 804516e4f483..0cbf4d5cb1ab 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -196,4 +196,10 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
 int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
 int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
 
+int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
+			     u8 namespace, u8 num_actions,
+			     void *modify_actions, u32 *modify_header_id);
+void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev,
+				u32 modify_header_id);
+
 #endif
-- 
cgit v1.2.3


From 61444b458b01c95e55003d6f0b4d4c936fde51cb Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:44 +0300
Subject: net/mlx5: Break encap/decap into two separated flow table creation
 flags

Today we are able to attach encap and decap actions only to the FDB. In
preparation to enable those actions on the NIC flow tables, break the
single flag into two. Those flags control whatever a decap or encap
operations can be attached to the flow table created. For FDB, if
encapsulation is required, we set both of them.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c           | 7 ++++---
 include/linux/mlx5/fs.h                                    | 3 ++-
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index f72b5c9dcfe9..ff21807a0c4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -529,7 +529,8 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 		esw_size >>= 1;
 
 	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
-		flags |= MLX5_FLOW_TABLE_TUNNEL_EN;
+		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP |
+			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
 	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
 						  esw_size,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 9ae777e56529..1698f325a21e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -152,7 +152,8 @@ static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 				      struct mlx5_flow_table *next_ft,
 				      unsigned int *table_id, u32 flags)
 {
-	int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN);
+	int en_encap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP);
+	int en_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]   = {0};
 	int err;
@@ -169,9 +170,9 @@ static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 	}
 
 	MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en,
-		 en_encap_decap);
+		 en_decap);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.encap_en,
-		 en_encap_decap);
+		 en_encap);
 
 	switch (op_mod) {
 	case FS_FT_OP_MOD_NORMAL:
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 0cbf4d5cb1ab..0194e62ad66a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -45,7 +45,8 @@ enum {
 };
 
 enum {
-	MLX5_FLOW_TABLE_TUNNEL_EN = BIT(0),
+	MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP = BIT(0),
+	MLX5_FLOW_TABLE_TUNNEL_EN_DECAP = BIT(1),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
-- 
cgit v1.2.3


From e0e7a3861b6c6b673dc93e291ef11cf5e746b0c2 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:45 +0300
Subject: net/mlx5: Move header encap type to IFC header file

Those bits are hardware specification and should be defined in the
IFC header file.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 -----
 include/linux/mlx5/mlx5_ifc.h                   | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 9131a1376e7d..240a6fe1587e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -100,11 +100,6 @@ struct mlx5e_tc_flow_parse_attr {
 	int mirred_ifindex;
 };
 
-enum {
-	MLX5_HEADER_TYPE_VXLAN = 0x0,
-	MLX5_HEADER_TYPE_NVGRE = 0x1,
-};
-
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(16)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f043d65b9bac..bd725e0924e5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4848,6 +4848,11 @@ struct mlx5_ifc_alloc_encap_header_out_bits {
 	u8         reserved_at_60[0x20];
 };
 
+enum {
+	MLX5_HEADER_TYPE_VXLAN = 0x0,
+	MLX5_HEADER_TYPE_NVGRE = 0x1,
+};
+
 struct mlx5_ifc_alloc_encap_header_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
-- 
cgit v1.2.3


From 60786f0987c0d9354e5330ee11615b16cdb448fe Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:46 +0300
Subject: {net, RDMA}/mlx5: Rename encap to reformat packet

Renames all encap mlx5_{core,ib} code to use the new naming of packet
reformat. This change doesn't introduce any function change and is
needed to properly reflect the operation being done by this action.
For example not only can we encapsulate a packet, but also decapsulate it.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/devx.c                  |  6 +--
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  8 +--
 .../mellanox/mlx5/core/diag/fs_tracepoint.h        |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 43 ++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  8 +--
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   | 63 ++++++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    | 13 ++---
 include/linux/mlx5/fs.h                            |  4 +-
 include/linux/mlx5/mlx5_ifc.h                      | 50 ++++++++---------
 11 files changed, 107 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index ac116d63e466..25dafa4ff6ca 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -284,7 +284,7 @@ static bool devx_is_obj_create_cmd(const void *in)
 	case MLX5_CMD_OP_CREATE_FLOW_TABLE:
 	case MLX5_CMD_OP_CREATE_FLOW_GROUP:
 	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
-	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
 	case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
 	case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
 	case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
@@ -627,9 +627,9 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
 			 MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
 		break;
-	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
-			 MLX5_CMD_OP_DEALLOC_ENCAP_HEADER);
+			 MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT);
 		break;
 	case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 6f589b4d33d9..39750fca371d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -308,7 +308,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_MODIFY_FLOW_TABLE:
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ROOT:
-	case MLX5_CMD_OP_DEALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT:
 	case MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT:
 	case MLX5_CMD_OP_FPGA_DESTROY_QP:
 	case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT:
@@ -427,7 +427,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
 	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
 	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
-	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
 	case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
 	case MLX5_CMD_OP_FPGA_CREATE_QP:
 	case MLX5_CMD_OP_FPGA_MODIFY_QP:
@@ -601,8 +601,8 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER);
 	MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER);
 	MLX5_COMMAND_STR_CASE(MODIFY_FLOW_TABLE);
-	MLX5_COMMAND_STR_CASE(ALLOC_ENCAP_HEADER);
-	MLX5_COMMAND_STR_CASE(DEALLOC_ENCAP_HEADER);
+	MLX5_COMMAND_STR_CASE(ALLOC_PACKET_REFORMAT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(DEALLOC_PACKET_REFORMAT_CONTEXT);
 	MLX5_COMMAND_STR_CASE(ALLOC_MODIFY_HEADER_CONTEXT);
 	MLX5_COMMAND_STR_CASE(DEALLOC_MODIFY_HEADER_CONTEXT);
 	MLX5_COMMAND_STR_CASE(FPGA_CREATE_QP);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index 0240aee9189e..e83dda441a81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -133,7 +133,7 @@ TRACE_EVENT(mlx5_fs_del_fg,
 	{MLX5_FLOW_CONTEXT_ACTION_DROP,		 "DROP"},\
 	{MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,	 "FWD"},\
 	{MLX5_FLOW_CONTEXT_ACTION_COUNT,	 "CNT"},\
-	{MLX5_FLOW_CONTEXT_ACTION_ENCAP,	 "ENCAP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT, "REFORMAT"},\
 	{MLX5_FLOW_CONTEXT_ACTION_DECAP,	 "DECAP"},\
 	{MLX5_FLOW_CONTEXT_ACTION_MOD_HDR,	 "MOD_HDR"},\
 	{MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH,	 "VLAN_PUSH"},\
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 240a6fe1587e..3df8f2b90908 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -681,7 +681,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 		.action = attr->action,
 		.has_flow_tag = true,
 		.flow_tag = attr->flow_tag,
-		.encap_id = 0,
+		.reformat_id = 0,
 	};
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
@@ -829,7 +829,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	struct mlx5e_priv *out_priv;
 	int err;
 
-	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) {
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
 		out_dev = __dev_get_by_index(dev_net(priv->netdev),
 					     attr->parse_attr->mirred_ifindex);
 		err = mlx5e_attach_encap(priv, &parse_attr->tun_info,
@@ -885,7 +885,7 @@ err_add_rule:
 err_mod_hdr:
 	mlx5_eswitch_del_vlan_action(esw, attr);
 err_add_vlan:
-	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
 		mlx5e_detach_encap(priv, flow);
 err_attach_encap:
 	return rule;
@@ -906,7 +906,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 
 	mlx5_eswitch_del_vlan_action(esw, attr);
 
-	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) {
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
 		mlx5e_detach_encap(priv, flow);
 		kvfree(attr->parse_attr);
 	}
@@ -923,9 +923,9 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 	struct mlx5e_tc_flow *flow;
 	int err;
 
-	err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
-			       e->encap_size, e->encap_header,
-			       &e->encap_id);
+	err = mlx5_packet_reformat_alloc(priv->mdev, e->tunnel_type,
+					 e->encap_size, e->encap_header,
+					 &e->encap_id);
 	if (err) {
 		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %d\n",
 			       err);
@@ -979,7 +979,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 
 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
 		e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
-		mlx5_encap_dealloc(priv->mdev, e->encap_id);
+		mlx5_packet_reformat_dealloc(priv->mdev, e->encap_id);
 	}
 }
 
@@ -1048,7 +1048,7 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv,
 		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
 
 		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
-			mlx5_encap_dealloc(priv->mdev, e->encap_id);
+			mlx5_packet_reformat_dealloc(priv->mdev, e->encap_id);
 
 		hash_del_rcu(&e->encap_hlist);
 		kfree(e->encap_header);
@@ -2323,7 +2323,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 		return -ENOMEM;
 
 	switch (e->tunnel_type) {
-	case MLX5_HEADER_TYPE_VXLAN:
+	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
 		fl4.flowi4_proto = IPPROTO_UDP;
 		fl4.fl4_dport = tun_key->tp_dst;
 		break;
@@ -2367,7 +2367,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 	read_unlock_bh(&n->lock);
 
 	switch (e->tunnel_type) {
-	case MLX5_HEADER_TYPE_VXLAN:
+	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
 		gen_vxlan_header_ipv4(out_dev, encap_header,
 				      ipv4_encap_size, e->h_dest, tos, ttl,
 				      fl4.daddr,
@@ -2387,8 +2387,9 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 		goto out;
 	}
 
-	err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
-			       ipv4_encap_size, encap_header, &e->encap_id);
+	err = mlx5_packet_reformat_alloc(priv->mdev, e->tunnel_type,
+					 ipv4_encap_size, encap_header,
+					 &e->encap_id);
 	if (err)
 		goto destroy_neigh_entry;
 
@@ -2432,7 +2433,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 		return -ENOMEM;
 
 	switch (e->tunnel_type) {
-	case MLX5_HEADER_TYPE_VXLAN:
+	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
 		fl6.flowi6_proto = IPPROTO_UDP;
 		fl6.fl6_dport = tun_key->tp_dst;
 		break;
@@ -2476,7 +2477,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 	read_unlock_bh(&n->lock);
 
 	switch (e->tunnel_type) {
-	case MLX5_HEADER_TYPE_VXLAN:
+	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
 		gen_vxlan_header_ipv6(out_dev, encap_header,
 				      ipv6_encap_size, e->h_dest, tos, ttl,
 				      &fl6.daddr,
@@ -2497,8 +2498,9 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 		goto out;
 	}
 
-	err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
-			       ipv6_encap_size, encap_header, &e->encap_id);
+	err = mlx5_packet_reformat_alloc(priv->mdev, e->tunnel_type,
+					 ipv6_encap_size, encap_header,
+					 &e->encap_id);
 	if (err)
 		goto destroy_neigh_entry;
 
@@ -2546,7 +2548,7 @@ vxlan_encap_offload_err:
 
 	if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, be16_to_cpu(key->tp_dst)) &&
 	    MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) {
-		tunnel_type = MLX5_HEADER_TYPE_VXLAN;
+		tunnel_type = MLX5_REFORMAT_TYPE_L2_TO_VXLAN;
 	} else {
 		netdev_warn(priv->netdev,
 			    "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->tp_dst));
@@ -2721,7 +2723,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				parse_attr->mirred_ifindex = out_dev->ifindex;
 				parse_attr->tun_info = *info;
 				attr->parse_attr = parse_attr;
-				action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP |
+				action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT |
 					  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
 					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 				/* attr->out_rep is resolved when we handle encap */
@@ -2867,7 +2869,8 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
 		flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
 
 	if (!(flow->flags & MLX5E_TC_FLOW_ESWITCH) ||
-	    !(flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP))
+	    !(flow->esw_attr->action &
+	      MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT))
 		kvfree(parse_attr);
 
 	err = rhashtable_insert_fast(tc_ht, &flow->node, tc_ht_params);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 2b252cde5cc2..525b7e43b298 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1746,7 +1746,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	esw->enabled_vports = 0;
 	esw->mode = SRIOV_NONE;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
-	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) &&
+	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) &&
 	    MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
 		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC;
 	else
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index ff21807a0c4b..00ec6dd72080 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -127,8 +127,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
 		flow_act.modify_id = attr->mod_hdr_id;
 
-	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
-		flow_act.encap_id = attr->encap_id;
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
+		flow_act.reformat_id = attr->encap_id;
 
 	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
@@ -529,7 +529,7 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 		esw_size >>= 1;
 
 	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
-		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP |
+		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
 	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
@@ -1245,7 +1245,7 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap)
 		return err;
 
 	if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE &&
-	    (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) ||
+	    (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) ||
 	     !MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)))
 		return -EOPNOTSUPP;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 1698f325a21e..4539b709db20 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -152,7 +152,7 @@ static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 				      struct mlx5_flow_table *next_ft,
 				      unsigned int *table_id, u32 flags)
 {
-	int en_encap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP);
+	int en_encap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
 	int en_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]   = {0};
@@ -171,7 +171,7 @@ static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 
 	MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en,
 		 en_decap);
-	MLX5_SET(create_flow_table_in, in, flow_table_context.encap_en,
+	MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en,
 		 en_encap);
 
 	switch (op_mod) {
@@ -344,7 +344,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 
 	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->action.flow_tag);
 	MLX5_SET(flow_context, in_flow_context, action, fte->action.action);
-	MLX5_SET(flow_context, in_flow_context, encap_id, fte->action.encap_id);
+	MLX5_SET(flow_context, in_flow_context, packet_reformat_id,
+		 fte->action.reformat_id);
 	MLX5_SET(flow_context, in_flow_context, modify_header_id,
 		 fte->action.modify_id);
 
@@ -595,16 +596,16 @@ void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev,
 	*bytes = MLX5_GET64(traffic_counter, stats, octets);
 }
 
-int mlx5_encap_alloc(struct mlx5_core_dev *dev,
-		     int header_type,
-		     size_t size,
-		     void *encap_header,
-		     u32 *encap_id)
+int mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
+			       int reformat_type,
+			       size_t size,
+			       void *reformat_data,
+			       u32 *packet_reformat_id)
 {
 	int max_encap_size = MLX5_CAP_ESW(dev, max_encap_header_size);
-	u32 out[MLX5_ST_SZ_DW(alloc_encap_header_out)];
-	void *encap_header_in;
-	void *header;
+	u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)];
+	void *packet_reformat_context_in;
+	void *reformat;
 	int inlen;
 	int err;
 	u32 *in;
@@ -615,39 +616,47 @@ int mlx5_encap_alloc(struct mlx5_core_dev *dev,
 		return -EINVAL;
 	}
 
-	in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + size,
+	in = kzalloc(MLX5_ST_SZ_BYTES(alloc_packet_reformat_context_in) + size,
 		     GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
 
-	encap_header_in = MLX5_ADDR_OF(alloc_encap_header_in, in, encap_header);
-	header = MLX5_ADDR_OF(encap_header_in, encap_header_in, encap_header);
-	inlen = header - (void *)in  + size;
+	packet_reformat_context_in = MLX5_ADDR_OF(alloc_packet_reformat_context_in,
+						  in, packet_reformat_context);
+	reformat = MLX5_ADDR_OF(packet_reformat_context_in,
+				packet_reformat_context_in,
+				reformat_data);
+	inlen = reformat - (void *)in  + size;
 
 	memset(in, 0, inlen);
-	MLX5_SET(alloc_encap_header_in, in, opcode,
-		 MLX5_CMD_OP_ALLOC_ENCAP_HEADER);
-	MLX5_SET(encap_header_in, encap_header_in, encap_header_size, size);
-	MLX5_SET(encap_header_in, encap_header_in, header_type, header_type);
-	memcpy(header, encap_header, size);
+	MLX5_SET(alloc_packet_reformat_context_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT);
+	MLX5_SET(packet_reformat_context_in, packet_reformat_context_in,
+		 reformat_data_size, size);
+	MLX5_SET(packet_reformat_context_in, packet_reformat_context_in,
+		 reformat_type, reformat_type);
+	memcpy(reformat, reformat_data, size);
 
 	memset(out, 0, sizeof(out));
 	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
 
-	*encap_id = MLX5_GET(alloc_encap_header_out, out, encap_id);
+	*packet_reformat_id = MLX5_GET(alloc_packet_reformat_context_out,
+				       out, packet_reformat_id);
 	kfree(in);
 	return err;
 }
 
-void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id)
+void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
+				  u32 packet_reformat_id)
 {
-	u32 in[MLX5_ST_SZ_DW(dealloc_encap_header_in)];
-	u32 out[MLX5_ST_SZ_DW(dealloc_encap_header_out)];
+	u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_out)];
 
 	memset(in, 0, sizeof(in));
-	MLX5_SET(dealloc_encap_header_in, in, opcode,
-		 MLX5_CMD_OP_DEALLOC_ENCAP_HEADER);
-	MLX5_SET(dealloc_encap_header_in, in, encap_id, encap_id);
+	MLX5_SET(dealloc_packet_reformat_context_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT);
+	MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id,
+		 packet_reformat_id);
 
 	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index b7e7eb3535c7..d2b162cfe86b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1407,7 +1407,7 @@ static bool check_conflicting_actions(u32 action1, u32 action2)
 		return false;
 
 	if (xored_actions & (MLX5_FLOW_CONTEXT_ACTION_DROP  |
-			     MLX5_FLOW_CONTEXT_ACTION_ENCAP |
+			     MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT |
 			     MLX5_FLOW_CONTEXT_ACTION_DECAP |
 			     MLX5_FLOW_CONTEXT_ACTION_MOD_HDR  |
 			     MLX5_FLOW_CONTEXT_ACTION_VLAN_POP |
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 649d1bd83a1a..f3c8f51cc9c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -169,12 +169,13 @@ struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev);
 void mlx5_dev_list_lock(void);
 void mlx5_dev_list_unlock(void);
 int mlx5_dev_list_trylock(void);
-int mlx5_encap_alloc(struct mlx5_core_dev *dev,
-		     int header_type,
-		     size_t size,
-		     void *encap_header,
-		     u32 *encap_id);
-void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id);
+int mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
+			       int reformat_type,
+			       size_t size,
+			       void *reformat_data,
+			       u32 *packet_reformat_id);
+void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
+				  u32 packet_reformat_id);
 
 bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv);
 
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 0194e62ad66a..37d0c08d0966 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -45,7 +45,7 @@ enum {
 };
 
 enum {
-	MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP = BIT(0),
+	MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT = BIT(0),
 	MLX5_FLOW_TABLE_TUNNEL_EN_DECAP = BIT(1),
 };
 
@@ -160,7 +160,7 @@ struct mlx5_flow_act {
 	u32 action;
 	bool has_flow_tag;
 	u32 flow_tag;
-	u32 encap_id;
+	u32 reformat_id;
 	u32 modify_id;
 	uintptr_t esp_id;
 	struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH];
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index bd725e0924e5..c79eaae28e59 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -243,8 +243,8 @@ enum {
 	MLX5_CMD_OP_DEALLOC_FLOW_COUNTER          = 0x93a,
 	MLX5_CMD_OP_QUERY_FLOW_COUNTER            = 0x93b,
 	MLX5_CMD_OP_MODIFY_FLOW_TABLE             = 0x93c,
-	MLX5_CMD_OP_ALLOC_ENCAP_HEADER            = 0x93d,
-	MLX5_CMD_OP_DEALLOC_ENCAP_HEADER          = 0x93e,
+	MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT = 0x93d,
+	MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT = 0x93e,
 	MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT   = 0x940,
 	MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941,
 	MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT   = 0x942,
@@ -336,7 +336,7 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         modify_root[0x1];
 	u8         identified_miss_table_mode[0x1];
 	u8         flow_table_modify[0x1];
-	u8         encap[0x1];
+	u8         reformat[0x1];
 	u8         decap[0x1];
 	u8         reserved_at_9[0x1];
 	u8         pop_vlan[0x1];
@@ -599,7 +599,7 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vxlan_encap_decap[0x1];
 	u8         nvgre_encap_decap[0x1];
 	u8         reserved_at_22[0x9];
-	u8         log_max_encap_headers[0x5];
+	u8         log_max_packet_reformat_context[0x5];
 	u8         reserved_2b[0x6];
 	u8         max_encap_header_size[0xa];
 
@@ -2394,7 +2394,7 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_DROP      = 0x2,
 	MLX5_FLOW_CONTEXT_ACTION_FWD_DEST  = 0x4,
 	MLX5_FLOW_CONTEXT_ACTION_COUNT     = 0x8,
-	MLX5_FLOW_CONTEXT_ACTION_ENCAP     = 0x10,
+	MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT = 0x10,
 	MLX5_FLOW_CONTEXT_ACTION_DECAP     = 0x20,
 	MLX5_FLOW_CONTEXT_ACTION_MOD_HDR   = 0x40,
 	MLX5_FLOW_CONTEXT_ACTION_VLAN_POP  = 0x80,
@@ -2427,7 +2427,7 @@ struct mlx5_ifc_flow_context_bits {
 	u8         reserved_at_a0[0x8];
 	u8         flow_counter_list_size[0x18];
 
-	u8         encap_id[0x20];
+	u8         packet_reformat_id[0x20];
 
 	u8         modify_header_id[0x20];
 
@@ -4802,19 +4802,19 @@ struct mlx5_ifc_query_eq_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
-struct mlx5_ifc_encap_header_in_bits {
+struct mlx5_ifc_packet_reformat_context_in_bits {
 	u8         reserved_at_0[0x5];
-	u8         header_type[0x3];
+	u8         reformat_type[0x3];
 	u8         reserved_at_8[0xe];
-	u8         encap_header_size[0xa];
+	u8         reformat_data_size[0xa];
 
 	u8         reserved_at_20[0x10];
-	u8         encap_header[2][0x8];
+	u8         reformat_data[2][0x8];
 
-	u8         more_encap_header[0][0x8];
+	u8         more_reformat_data[0][0x8];
 };
 
-struct mlx5_ifc_query_encap_header_out_bits {
+struct mlx5_ifc_query_packet_reformat_context_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
@@ -4822,38 +4822,38 @@ struct mlx5_ifc_query_encap_header_out_bits {
 
 	u8         reserved_at_40[0xa0];
 
-	struct mlx5_ifc_encap_header_in_bits encap_header[0];
+	struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context[0];
 };
 
-struct mlx5_ifc_query_encap_header_in_bits {
+struct mlx5_ifc_query_packet_reformat_context_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         encap_id[0x20];
+	u8         packet_reformat_id[0x20];
 
 	u8         reserved_at_60[0xa0];
 };
 
-struct mlx5_ifc_alloc_encap_header_out_bits {
+struct mlx5_ifc_alloc_packet_reformat_context_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
 	u8         syndrome[0x20];
 
-	u8         encap_id[0x20];
+	u8         packet_reformat_id[0x20];
 
 	u8         reserved_at_60[0x20];
 };
 
 enum {
-	MLX5_HEADER_TYPE_VXLAN = 0x0,
-	MLX5_HEADER_TYPE_NVGRE = 0x1,
+	MLX5_REFORMAT_TYPE_L2_TO_VXLAN = 0x0,
+	MLX5_REFORMAT_TYPE_L2_TO_NVGRE = 0x1,
 };
 
-struct mlx5_ifc_alloc_encap_header_in_bits {
+struct mlx5_ifc_alloc_packet_reformat_context_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
@@ -4862,10 +4862,10 @@ struct mlx5_ifc_alloc_encap_header_in_bits {
 
 	u8         reserved_at_40[0xa0];
 
-	struct mlx5_ifc_encap_header_in_bits encap_header;
+	struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context;
 };
 
-struct mlx5_ifc_dealloc_encap_header_out_bits {
+struct mlx5_ifc_dealloc_packet_reformat_context_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
@@ -4874,14 +4874,14 @@ struct mlx5_ifc_dealloc_encap_header_out_bits {
 	u8         reserved_at_40[0x40];
 };
 
-struct mlx5_ifc_dealloc_encap_header_in_bits {
+struct mlx5_ifc_dealloc_packet_reformat_context_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
 	u8         reserved_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         encap_id[0x20];
+	u8         packet_reformat_id[0x20];
 
 	u8         reserved_60[0x20];
 };
@@ -6983,7 +6983,7 @@ struct mlx5_ifc_create_flow_table_out_bits {
 };
 
 struct mlx5_ifc_flow_table_context_bits {
-	u8         encap_en[0x1];
+	u8         reformat_en[0x1];
 	u8         decap_en[0x1];
 	u8         reserved_at_2[0x2];
 	u8         table_miss_action[0x4];
-- 
cgit v1.2.3


From bea4e1f6c6c5744d467ebf8b0699f5e391835130 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:47 +0300
Subject: net/mlx5: Expose new packet reformat capabilities

Expose new abilities when creating a packet reformat context.

The new types which can be created are:
MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: Ability to create generic encap
operation to be done by the HW.

MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2: Ability to create generic decap
operation where the inner packet doesn't contain L2.

MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: Ability to create generic encap
operation to be done by the HW. The L2 of the original packet
is dropped.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c79eaae28e59..3a4a2e0567e9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -344,8 +344,12 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         reserved_at_c[0x1];
 	u8         pop_vlan_2[0x1];
 	u8         push_vlan_2[0x1];
-	u8         reserved_at_f[0x11];
-
+	u8	   reformat_and_vlan_action[0x1];
+	u8	   reserved_at_10[0x2];
+	u8	   reformat_l3_tunnel_to_l2[0x1];
+	u8	   reformat_l2_to_l3_tunnel[0x1];
+	u8	   reformat_and_modify_action[0x1];
+	u8         reserved_at_14[0xb];
 	u8         reserved_at_20[0x2];
 	u8         log_max_ft_size[0x6];
 	u8         log_max_modify_header_context[0x8];
@@ -554,7 +558,13 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 	u8         nic_rx_multi_path_tirs[0x1];
 	u8         nic_rx_multi_path_tirs_fts[0x1];
 	u8         allow_sniffer_and_nic_rx_shared_tir[0x1];
-	u8         reserved_at_3[0x1fd];
+	u8	   reserved_at_3[0x1d];
+	u8	   encap_general_header[0x1];
+	u8	   reserved_at_21[0xa];
+	u8	   log_max_packet_reformat_context[0x5];
+	u8	   reserved_at_30[0x6];
+	u8	   max_encap_header_size[0xa];
+	u8	   reserved_at_40[0x1c0];
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive;
 
@@ -4851,6 +4861,9 @@ struct mlx5_ifc_alloc_packet_reformat_context_out_bits {
 enum {
 	MLX5_REFORMAT_TYPE_L2_TO_VXLAN = 0x0,
 	MLX5_REFORMAT_TYPE_L2_TO_NVGRE = 0x1,
+	MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2,
+	MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3,
+	MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4,
 };
 
 struct mlx5_ifc_alloc_packet_reformat_context_in_bits {
-- 
cgit v1.2.3


From 50acec06f3928fc29647aecf1270e54cae583afb Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 28 Aug 2018 14:18:49 +0300
Subject: net/mlx5: Export packet reformat alloc/dealloc functions

This will allow for the RDMA side to allocate packet reformat context.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c    | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 8 --------
 include/linux/mlx5/fs.h                             | 9 +++++++++
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index cc9537891e39..dc8d7f6b52c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -651,6 +651,7 @@ int mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
 	kfree(in);
 	return err;
 }
+EXPORT_SYMBOL(mlx5_packet_reformat_alloc);
 
 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				  u32 packet_reformat_id)
@@ -666,6 +667,7 @@ void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 
 	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_packet_reformat_dealloc);
 
 int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
 			     u8 namespace, u8 num_actions,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 66a5dd5a6cbe..61a014e3f688 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -170,14 +170,6 @@ struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev);
 void mlx5_dev_list_lock(void);
 void mlx5_dev_list_unlock(void);
 int mlx5_dev_list_trylock(void);
-int mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
-			       int reformat_type,
-			       size_t size,
-			       void *reformat_data,
-			       enum mlx5_flow_namespace_type namespace,
-			       u32 *packet_reformat_id);
-void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
-				  u32 packet_reformat_id);
 
 bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv);
 
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 37d0c08d0966..b1c026f1c8ba 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -203,4 +203,13 @@ int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
 void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev,
 				u32 modify_header_id);
 
+int mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
+			       int reformat_type,
+			       size_t size,
+			       void *reformat_data,
+			       enum mlx5_flow_namespace_type namespace,
+			       u32 *packet_reformat_id);
+void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
+				  u32 packet_reformat_id);
+
 #endif
-- 
cgit v1.2.3


From adf8ed01e4fdd254efead978d633718ab01a7d5c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Aug 2018 11:31:08 +0300
Subject: mac80211: add an optional TXQ for other PS-buffered frames

Some drivers may want to also use the TXQ abstraction with
non-data packets that need powersave buffering, so add a
hardware flag to allow this.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 17 ++++++++++++-----
 net/mac80211/debugfs.c     |  2 ++
 net/mac80211/debugfs_sta.c |  6 ++++--
 net/mac80211/rx.c          |  2 +-
 net/mac80211/sta_info.c    | 21 +++++++++++++--------
 net/mac80211/tx.c          | 37 +++++++++++++++++++++++++++----------
 6 files changed, 59 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e248f5fe5b19..03e1dfd311f7 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -101,8 +101,9 @@
  * Drivers indicate that they use this model by implementing the .wake_tx_queue
  * driver operation.
  *
- * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with a
- * single per-vif queue for multicast data frames.
+ * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with
+ * another per-sta for non-data/non-mgmt and bufferable management frames, and
+ * a single per-vif queue for multicast data frames.
  *
  * The driver is expected to initialize its private per-queue data for stations
  * and interfaces in the .add_interface and .sta_add ops.
@@ -1843,7 +1844,8 @@ struct ieee80211_sta_rates {
  *	unlimited.
  * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
  * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
- * @txq: per-TID data TX queues (if driver uses the TXQ abstraction)
+ * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that
+ *	the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames
  */
 struct ieee80211_sta {
 	u32 supp_rates[NUM_NL80211_BANDS];
@@ -1884,7 +1886,7 @@ struct ieee80211_sta {
 	bool support_p2p_ps;
 	u16 max_rc_amsdu_len;
 
-	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS];
+	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
 
 	/* must be last */
 	u8 drv_priv[0] __aligned(sizeof(void *));
@@ -1918,7 +1920,8 @@ struct ieee80211_tx_control {
  *
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @sta: station table entry, %NULL for per-vif queue
- * @tid: the TID for this queue (unused for per-vif queue)
+ * @tid: the TID for this queue (unused for per-vif queue),
+ *	%IEEE80211_NUM_TIDS for non-data (if enabled)
  * @ac: the AC for this queue
  * @drv_priv: driver private area, sized by hw->txq_data_size
  *
@@ -2131,6 +2134,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't
  *	support QoS NDP for AP probing - that's most likely a driver bug.
  *
+ * @IEEE80211_HW_BUFF_MMPDU_TXQ: use the TXQ for bufferable MMPDUs, this of
+ *	course requires the driver to use TXQs to start with.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2176,6 +2182,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
 	IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP,
 	IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP,
+	IEEE80211_HW_BUFF_MMPDU_TXQ,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index b5adf3625d16..964663f49e58 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -3,6 +3,7 @@
  *
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * GPLv2
  *
@@ -214,6 +215,7 @@ static const char *hw_flag_names[] = {
 	FLAG(SUPPORTS_TDLS_BUFFER_STA),
 	FLAG(DEAUTH_NEED_MGD_TX_PREP),
 	FLAG(DOESNT_SUPPORT_QOS_NDP),
+	FLAG(BUFF_MMPDU_TXQ),
 #undef FLAG
 };
 
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index de66f685a107..95124978947f 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -141,7 +141,7 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
 {
 	struct sta_info *sta = file->private_data;
 	struct ieee80211_local *local = sta->local;
-	size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1);
+	size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2);
 	char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
 	struct txq_info *txqi;
 	ssize_t rv;
@@ -163,7 +163,9 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
 		       bufsz+buf-p,
 		       "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
 
-	for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+	for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+		if (!sta->sta.txq[i])
+			continue;
 		txqi = to_txq_info(sta->sta.txq[i]);
 		p += scnprintf(p, bufsz+buf-p,
 			       "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n",
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index bf6b7ad7f7cd..b79bc9055035 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1505,7 +1505,7 @@ static void sta_ps_start(struct sta_info *sta)
 	if (!sta->sta.txq[0])
 		return;
 
-	for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
+	for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
 		if (txq_has_queue(sta->sta.txq[tid]))
 			set_bit(tid, &sta->txq_buffered_tids);
 		else
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index a231d623b2d2..fb8c2252ac0e 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -113,7 +113,12 @@ static void __cleanup_single_sta(struct sta_info *sta)
 
 	if (sta->sta.txq[0]) {
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
-			struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
+			struct txq_info *txqi;
+
+			if (!sta->sta.txq[i])
+				continue;
+
+			txqi = to_txq_info(sta->sta.txq[i]);
 
 			spin_lock_bh(&fq->lock);
 			ieee80211_txq_purge(local, txqi);
@@ -374,6 +379,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
 			struct txq_info *txq = txq_data + i * size;
 
+			/* might not do anything for the bufferable MMPDU TXQ */
 			ieee80211_txq_init(sdata, sta, txq, i);
 		}
 	}
@@ -1239,13 +1245,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 	if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
 		drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
 
-	if (sta->sta.txq[0]) {
-		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
-			if (!txq_has_queue(sta->sta.txq[i]))
-				continue;
+	for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+		if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i]))
+			continue;
 
-			drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
-		}
+		drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
 	}
 
 	skb_queue_head_init(&pending);
@@ -1683,7 +1687,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
 			return;
 
 		for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-			if (!(driver_release_tids & BIT(tid)) ||
+			if (!sta->sta.txq[tid] ||
+			    !(driver_release_tids & BIT(tid)) ||
 			    txq_has_queue(sta->sta.txq[tid]))
 				continue;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 58502d29be54..5083905486c7 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1249,10 +1249,17 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
 	    (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
 		return NULL;
 
-	if (!ieee80211_is_data_present(hdr->frame_control))
-		return NULL;
-
-	if (sta) {
+	if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
+		if ((!ieee80211_is_mgmt(hdr->frame_control) ||
+		     ieee80211_is_bufferable_mmpdu(hdr->frame_control)) &&
+		    sta && sta->uploaded) {
+			/*
+			 * This will be NULL if the driver didn't set the
+			 * opt-in hardware flag.
+			 */
+			txq = sta->sta.txq[IEEE80211_NUM_TIDS];
+		}
+	} else if (sta) {
 		u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
 
 		if (!sta->uploaded)
@@ -1440,16 +1447,26 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 
 	txqi->txq.vif = &sdata->vif;
 
-	if (sta) {
-		txqi->txq.sta = &sta->sta;
-		sta->sta.txq[tid] = &txqi->txq;
-		txqi->txq.tid = tid;
-		txqi->txq.ac = ieee80211_ac_from_tid(tid);
-	} else {
+	if (!sta) {
 		sdata->vif.txq = &txqi->txq;
 		txqi->txq.tid = 0;
 		txqi->txq.ac = IEEE80211_AC_BE;
+
+		return;
+	}
+
+	if (tid == IEEE80211_NUM_TIDS) {
+		/* Drivers need to opt in to the bufferable MMPDU TXQ */
+		if (!ieee80211_hw_check(&sdata->local->hw, BUFF_MMPDU_TXQ))
+			return;
+		txqi->txq.ac = IEEE80211_AC_VO;
+	} else {
+		txqi->txq.ac = ieee80211_ac_from_tid(tid);
 	}
+
+	txqi->txq.sta = &sta->sta;
+	txqi->txq.tid = tid;
+	sta->sta.txq[tid] = &txqi->txq;
 }
 
 void ieee80211_txq_purge(struct ieee80211_local *local,
-- 
cgit v1.2.3


From 03512ceb60ae4be71ed3129dabb8625224c8ec40 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Fri, 31 Aug 2018 11:31:09 +0300
Subject: ieee80211: remove redundant leading zeroes

The defines of IEEE80211_HE_OPERATION_VHT_OPER_INFO and
IEEE80211_HE_OPERATION_MULTI_BSSID_AP have leading zeroes
that makes the number look like it is bigger than 32 bit.
This is misleading, remove it.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 9c03a7d5e400..17ea51d088ae 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1963,8 +1963,8 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
 #define IEEE80211_HE_OPERATION_TWT_REQUIRED			0x00000200
 #define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK		0x000ffc00
 #define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET		10
-#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x000100000
-#define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x000200000
+#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x00100000
+#define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x00200000
 #define IEEE80211_HE_OPERATION_MULTI_BSSID_AP			0x10000000
 #define IEEE80211_HE_OPERATION_TX_BSSID_INDICATOR		0x20000000
 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x40000000
-- 
cgit v1.2.3


From 244eb9ae797385c2ed244f6bdf0534fcaa6f0d33 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Fri, 31 Aug 2018 11:31:14 +0300
Subject: cfg80211: add he_capabilities (ext) IE to AP settings

Same as for HT and VHT.
This helps the lower level to know whether the AP supports HE.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 ++
 net/wireless/nl80211.c | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 33c2a1d2a8d2..9f3ed79c39d7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -849,6 +849,7 @@ struct cfg80211_bitrate_mask {
  * @beacon_rate: bitrate to be used for beacons
  * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
  * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
+ * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
  * @ht_required: stations must support HT
  * @vht_required: stations must support VHT
  */
@@ -874,6 +875,7 @@ struct cfg80211_ap_settings {
 
 	const struct ieee80211_ht_cap *ht_cap;
 	const struct ieee80211_vht_cap *vht_cap;
+	const struct ieee80211_he_cap_elem *he_cap;
 	bool ht_required, vht_required;
 };
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index fbd0747a5a9d..d5f9b5235cdd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4095,6 +4095,9 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
 	cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
 	if (cap && cap[1] >= sizeof(*params->vht_cap))
 		params->vht_cap = (void *)(cap + 2);
+	cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len);
+	if (cap && cap[1] >= sizeof(*params->he_cap) + 1)
+		params->he_cap = (void *)(cap + 3);
 }
 
 static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
-- 
cgit v1.2.3


From b0aa75f0b1b2e6bc77128fab36c8ed87e84917cc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Aug 2018 11:31:16 +0300
Subject: ieee80211: add new VHT capability fields/parsing

IEEE 802.11-2016 extended the VHT capability fields to allow
indicating the number of spatial streams depending on the
actually used bandwidth, add support for decoding this.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  35 ++++++++++++++-
 net/wireless/util.c       | 109 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 17ea51d088ae..280600a10111 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1460,13 +1460,16 @@ struct ieee80211_ht_operation {
  *	STA can receive. Rate expressed in units of 1 Mbps.
  *	If this field is 0 this value should not be used to
  *	consider the highest RX data rate supported.
- *	The top 3 bits of this field are reserved.
+ *	The top 3 bits of this field indicate the Maximum NSTS,total
+ *	(a beamformee capability.)
  * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams
  * @tx_highest: Indicates highest long GI VHT PPDU data rate
  *	STA can transmit. Rate expressed in units of 1 Mbps.
  *	If this field is 0 this value should not be used to
  *	consider the highest TX data rate supported.
- *	The top 3 bits of this field are reserved.
+ *	The top 2 bits of this field are reserved, the
+ *	3rd bit from the top indiciates VHT Extended NSS BW
+ *	Capability.
  */
 struct ieee80211_vht_mcs_info {
 	__le16 rx_mcs_map;
@@ -1475,6 +1478,13 @@ struct ieee80211_vht_mcs_info {
 	__le16 tx_highest;
 } __packed;
 
+/* for rx_highest */
+#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT	13
+#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK	(7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT)
+
+/* for tx_highest */
+#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE	(1 << 13)
+
 /**
  * enum ieee80211_vht_mcs_support - VHT MCS support definitions
  * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
@@ -1650,6 +1660,7 @@ struct ieee80211_mu_edca_param_set {
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ		0x00000004
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ	0x00000008
 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK			0x0000000C
+#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT			2
 #define IEEE80211_VHT_CAP_RXLDPC				0x00000010
 #define IEEE80211_VHT_CAP_SHORT_GI_80				0x00000020
 #define IEEE80211_VHT_CAP_SHORT_GI_160				0x00000040
@@ -1678,6 +1689,26 @@ struct ieee80211_mu_edca_param_set {
 #define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB	0x0c000000
 #define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN			0x10000000
 #define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN			0x20000000
+#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT			30
+#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK			0xc0000000
+
+/**
+ * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS
+ * @cap: VHT capabilities of the peer
+ * @bw: bandwidth to use
+ * @mcs: MCS index to use
+ * @ext_nss_bw_capable: indicates whether or not the local transmitter
+ *	(rate scaling algorithm) can deal with the new logic
+ *	(dot11VHTExtendedNSSBWCapable)
+ *
+ * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can
+ * vary for a given BW/MCS. This function parses the data.
+ *
+ * Note: This function is exported by cfg80211.
+ */
+int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
+			      enum ieee80211_vht_chanwidth bw,
+			      int mcs, bool ext_nss_bw_capable);
 
 /* 802.11ax HE MAC capabilities */
 #define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 4293f980e9c4..ef14d80ca03e 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,17 +5,20 @@
  * Copyright 2007-2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  */
 #include <linux/export.h>
 #include <linux/bitops.h>
 #include <linux/etherdevice.h>
 #include <linux/slab.h>
+#include <linux/ieee80211.h>
 #include <net/cfg80211.h>
 #include <net/ip.h>
 #include <net/dsfield.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
 #include <linux/gcd.h>
+#include <linux/bitfield.h>
 #include "core.h"
 #include "rdev-ops.h"
 
@@ -1938,3 +1941,109 @@ void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr)
 	netif_rx_ni(skb);
 }
 EXPORT_SYMBOL(cfg80211_send_layer2_update);
+
+int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
+			      enum ieee80211_vht_chanwidth bw,
+			      int mcs, bool ext_nss_bw_capable)
+{
+	u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map);
+	int max_vht_nss = 0;
+	int ext_nss_bw;
+	int supp_width;
+	int i, mcs_encoding;
+
+	if (map == 0xffff)
+		return 0;
+
+	if (WARN_ON(mcs > 9))
+		return 0;
+	if (mcs <= 7)
+		mcs_encoding = 0;
+	else if (mcs == 8)
+		mcs_encoding = 1;
+	else
+		mcs_encoding = 2;
+
+	/* find max_vht_nss for the given MCS */
+	for (i = 7; i >= 0; i--) {
+		int supp = (map >> (2 * i)) & 3;
+
+		if (supp == 3)
+			continue;
+
+		if (supp >= mcs_encoding) {
+			max_vht_nss = i;
+			break;
+		}
+	}
+
+	if (!(cap->supp_mcs.tx_mcs_map &
+			cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
+		return max_vht_nss;
+
+	ext_nss_bw = le32_get_bits(cap->vht_cap_info,
+				   IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
+	supp_width = le32_get_bits(cap->vht_cap_info,
+				   IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK);
+
+	/* if not capable, treat ext_nss_bw as 0 */
+	if (!ext_nss_bw_capable)
+		ext_nss_bw = 0;
+
+	/* This is invalid */
+	if (supp_width == 3)
+		return 0;
+
+	/* This is an invalid combination so pretend nothing is supported */
+	if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2))
+		return 0;
+
+	/*
+	 * Cover all the special cases according to IEEE 802.11-2016
+	 * Table 9-250. All other cases are either factor of 1 or not
+	 * valid/supported.
+	 */
+	switch (bw) {
+	case IEEE80211_VHT_CHANWIDTH_USE_HT:
+	case IEEE80211_VHT_CHANWIDTH_80MHZ:
+		if ((supp_width == 1 || supp_width == 2) &&
+		    ext_nss_bw == 3)
+			return 2 * max_vht_nss;
+		break;
+	case IEEE80211_VHT_CHANWIDTH_160MHZ:
+		if (supp_width == 0 &&
+		    (ext_nss_bw == 1 || ext_nss_bw == 2))
+			return DIV_ROUND_UP(max_vht_nss, 2);
+		if (supp_width == 0 &&
+		    ext_nss_bw == 3)
+			return DIV_ROUND_UP(3 * max_vht_nss, 4);
+		if (supp_width == 1 &&
+		    ext_nss_bw == 3)
+			return 2 * max_vht_nss;
+		break;
+	case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
+		if (supp_width == 0 &&
+		    (ext_nss_bw == 1 || ext_nss_bw == 2))
+			return 0; /* not possible */
+		if (supp_width == 0 &&
+		    ext_nss_bw == 2)
+			return DIV_ROUND_UP(max_vht_nss, 2);
+		if (supp_width == 0 &&
+		    ext_nss_bw == 3)
+			return DIV_ROUND_UP(3 * max_vht_nss, 4);
+		if (supp_width == 1 &&
+		    ext_nss_bw == 0)
+			return 0; /* not possible */
+		if (supp_width == 1 &&
+		    ext_nss_bw == 1)
+			return DIV_ROUND_UP(max_vht_nss, 2);
+		if (supp_width == 1 &&
+		    ext_nss_bw == 2)
+			return DIV_ROUND_UP(3 * max_vht_nss, 4);
+		break;
+	}
+
+	/* not covered or invalid combination received */
+	return max_vht_nss;
+}
+EXPORT_SYMBOL(ieee80211_get_vht_max_nss);
-- 
cgit v1.2.3


From 09b4a4faf9d037990ac4f8110dd944b27b42d5df Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Aug 2018 11:31:17 +0300
Subject: mac80211: introduce capability flags for VHT EXT NSS support

Depending on whether or not rate control supports selecting
rates depending on the bandwidth, we can use VHT extended
NSS support. In essence, this is dot11VHTExtendedNSSBWCapable
from the spec, since depending on that we'll need to parse
the bandwidth.

If needed, also set/clear the VHT Capability Element bit for
this capability so that we don't advertise it erroneously or
don't advertise it when we actually use it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     | 22 +++++++++++++++++++
 net/mac80211/debugfs.c     |  1 +
 net/mac80211/ieee80211_i.h |  3 +++
 net/mac80211/main.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 03e1dfd311f7..00e2e9909d45 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2137,6 +2137,12 @@ struct ieee80211_txq {
  * @IEEE80211_HW_BUFF_MMPDU_TXQ: use the TXQ for bufferable MMPDUs, this of
  *	course requires the driver to use TXQs to start with.
  *
+ * @IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW: (Hardware) rate control supports VHT
+ *	extended NSS BW (dot11VHTExtendedNSSBWCapable). This flag will be set if
+ *	the selected rate control algorithm sets %RATE_CTRL_CAPA_VHT_EXT_NSS_BW
+ *	but if the rate control is built-in then it must be set by the driver.
+ *	See also the documentation for that flag.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2183,6 +2189,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP,
 	IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP,
 	IEEE80211_HW_BUFF_MMPDU_TXQ,
+	IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -5655,7 +5662,22 @@ struct ieee80211_tx_rate_control {
 	bool bss;
 };
 
+/**
+ * enum rate_control_capabilities - rate control capabilities
+ */
+enum rate_control_capabilities {
+	/**
+	 * @RATE_CTRL_CAPA_VHT_EXT_NSS_BW:
+	 * Support for extended NSS BW support (dot11VHTExtendedNSSCapable)
+	 * Note that this is only looked at if the minimum number of chains
+	 * that the AP uses is < the number of TX chains the hardware has,
+	 * otherwise the NSS difference doesn't bother us.
+	 */
+	RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0),
+};
+
 struct rate_control_ops {
+	unsigned long capa;
 	const char *name;
 	void *(*alloc)(struct ieee80211_hw *hw, struct dentry *debugfsdir);
 	void (*free)(void *priv);
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 964663f49e58..fb45eb5d1dc4 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -216,6 +216,7 @@ static const char *hw_flag_names[] = {
 	FLAG(DEAUTH_NEED_MGD_TX_PREP),
 	FLAG(DOESNT_SUPPORT_QOS_NDP),
 	FLAG(BUFF_MMPDU_TXQ),
+	FLAG(SUPPORTS_VHT_EXT_NSS_BW),
 #undef FLAG
 };
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 348a52cefb43..08da90adeaea 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1199,6 +1199,9 @@ struct ieee80211_local {
 	/* number of RX chains the hardware has */
 	u8 rx_chains;
 
+	/* bitmap of which sbands were copied */
+	u8 sband_allocated;
+
 	int tx_headroom; /* required headroom for hardware/radiotap */
 
 	/* Tasklet and skb queue to process calls from IRQ mode. All frames
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 2d51eca46aa0..c78629f8b7a0 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -4,6 +4,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2017     Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1158,6 +1159,51 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		goto fail_rate;
 	}
 
+	if (local->rate_ctrl) {
+		clear_bit(IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, hw->flags);
+		if (local->rate_ctrl->ops->capa & RATE_CTRL_CAPA_VHT_EXT_NSS_BW)
+			ieee80211_hw_set(hw, SUPPORTS_VHT_EXT_NSS_BW);
+	}
+
+	/*
+	 * If the VHT capabilities don't have IEEE80211_VHT_EXT_NSS_BW_CAPABLE,
+	 * or have it when we don't, copy the sband structure and set/clear it.
+	 * This is necessary because rate scaling algorithms could be switched
+	 * and have different support values.
+	 * Print a message so that in the common case the reallocation can be
+	 * avoided.
+	 */
+	BUILD_BUG_ON(NUM_NL80211_BANDS > 8 * sizeof(local->sband_allocated));
+	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		struct ieee80211_supported_band *sband;
+		bool local_cap, ie_cap;
+
+		local_cap = ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW);
+
+		sband = local->hw.wiphy->bands[band];
+		if (!sband || !sband->vht_cap.vht_supported)
+			continue;
+
+		ie_cap = !!(sband->vht_cap.vht_mcs.tx_highest &
+			    cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE));
+
+		if (local_cap == ie_cap)
+			continue;
+
+		sband = kmemdup(sband, sizeof(*sband), GFP_KERNEL);
+		if (!sband)
+			goto fail_rate;
+
+		wiphy_dbg(hw->wiphy, "copying sband (band %d) due to VHT EXT NSS BW flag\n",
+			  band);
+
+		sband->vht_cap.vht_mcs.tx_highest ^=
+			cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE);
+
+		local->hw.wiphy->bands[band] = sband;
+		local->sband_allocated |= BIT(band);
+	}
+
 	/* add one default STA interface if supported */
 	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&
 	    !ieee80211_hw_check(hw, NO_AUTO_VIF)) {
@@ -1276,6 +1322,7 @@ static int ieee80211_free_ack_frame(int id, void *p, void *data)
 void ieee80211_free_hw(struct ieee80211_hw *hw)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
+	enum nl80211_band band;
 
 	mutex_destroy(&local->iflist_mtx);
 	mutex_destroy(&local->mtx);
@@ -1291,6 +1338,12 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
 
 	ieee80211_free_led_names(local);
 
+	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		if (!(local->sband_allocated & BIT(band)))
+			continue;
+		kfree(local->hw.wiphy->bands[band]);
+	}
+
 	wiphy_free(local->hw.wiphy);
 }
 EXPORT_SYMBOL(ieee80211_free_hw);
-- 
cgit v1.2.3


From 70e53669c4c41b0fc043cb0bcb518b53428edf64 Mon Sep 17 00:00:00 2001
From: Wen Gong <wgong@codeaurora.org>
Date: Wed, 8 Aug 2018 18:40:01 +0800
Subject: mac80211: Store sk_pacing_shift in ieee80211_hw

Make it possibly for drivers to adjust the default skb_pacing_shift
by storing it in the hardware struct.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
[adjust commit log, move & adjust comment]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  5 +++++
 net/mac80211/main.c    | 12 ++++++++++++
 net/mac80211/tx.c      |  8 +-------
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 00e2e9909d45..f8247d2658ac 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2308,6 +2308,10 @@ enum ieee80211_hw_flags {
  *	supported by HW.
  * @max_nan_de_entries: maximum number of NAN DE functions supported by the
  *	device.
+ *
+ * @tx_sk_pacing_shift: Pacing shift to set on TCP sockets when frames from
+ *	them are encountered. The default should typically not be changed,
+ *	unless the driver has good reasons for needing more buffers.
  */
 struct ieee80211_hw {
 	struct ieee80211_conf conf;
@@ -2343,6 +2347,7 @@ struct ieee80211_hw {
 	u8 n_cipher_schemes;
 	const struct ieee80211_cipher_scheme *cipher_schemes;
 	u8 max_nan_de_entries;
+	u8 tx_sk_pacing_shift;
 };
 
 static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index c78629f8b7a0..77381017bac7 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -611,6 +611,18 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	local->ops = ops;
 	local->use_chanctx = use_chanctx;
 
+	/*
+	 * We need a bit of data queued to build aggregates properly, so
+	 * instruct the TCP stack to allow more than a single ms of data
+	 * to be queued in the stack. The value is a bit-shift of 1
+	 * second, so 8 is ~4ms of queued data. Only affects local TCP
+	 * sockets.
+	 * This is the default, anyhow - drivers may need to override it
+	 * for local reasons (longer buffers, longer completion time, or
+	 * similar).
+	 */
+	local->hw.tx_sk_pacing_shift = 8;
+
 	/* set up some defaults */
 	local->hw.queues = 1;
 	local->hw.max_rates = 1;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 5083905486c7..7d34222337c0 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3625,13 +3625,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 	if (!IS_ERR_OR_NULL(sta)) {
 		struct ieee80211_fast_tx *fast_tx;
 
-		/* We need a bit of data queued to build aggregates properly, so
-		 * instruct the TCP stack to allow more than a single ms of data
-		 * to be queued in the stack. The value is a bit-shift of 1
-		 * second, so 8 is ~4ms of queued data. Only affects local TCP
-		 * sockets.
-		 */
-		sk_pacing_shift_update(skb->sk, 8);
+		sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift);
 
 		fast_tx = rcu_dereference(sta->fast_tx);
 
-- 
cgit v1.2.3


From d1332e7be25088383527e3de325930bea64780cb Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Fri, 31 Aug 2018 11:31:20 +0300
Subject: mac80211: support radiotap L-SIG data

As before with HE, the data needs to be provided by the
driver in the skb head, since there's not enough space
in the skb CB.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 15 +++++++++++++++
 include/net/mac80211.h           |  2 ++
 net/mac80211/rx.c                | 21 +++++++++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index feef706e1158..80d543902b8b 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -75,6 +75,7 @@ enum ieee80211_radiotap_presence {
 	IEEE80211_RADIOTAP_TIMESTAMP = 22,
 	IEEE80211_RADIOTAP_HE = 23,
 	IEEE80211_RADIOTAP_HE_MU = 24,
+	IEEE80211_RADIOTAP_LSIG = 27,
 
 	/* valid in every it_present bitmap, even vendor namespaces */
 	IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29,
@@ -325,6 +326,20 @@ enum ieee80211_radiotap_he_mu_bits {
 	IEEE80211_RADIOTAP_HE_MU_FLAGS2_CH2_CTR_26T_RU		= 0x0800,
 };
 
+enum ieee80211_radiotap_lsig_data1 {
+	IEEE80211_RADIOTAP_LSIG_DATA1_RATE_KNOWN		= 0x0001,
+	IEEE80211_RADIOTAP_LSIG_DATA1_LENGTH_KNOWN		= 0x0002,
+};
+
+enum ieee80211_radiotap_lsig_data2 {
+	IEEE80211_RADIOTAP_LSIG_DATA2_RATE			= 0x000f,
+	IEEE80211_RADIOTAP_LSIG_DATA2_LENGTH			= 0xfff0,
+};
+
+struct ieee80211_radiotap_lsig {
+	__le16 data1, data2;
+};
+
 /**
  * ieee80211_get_radiotap_len - get radiotap header length
  */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f8247d2658ac..3cc1ca17a1a8 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1141,6 +1141,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  *	from the RX info data, so leave those zeroed when building this data)
  * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present
  *	(&struct ieee80211_radiotap_he_mu)
+ * @RX_FLAG_RADIOTAP_LSIG: L-SIG radiotap data is present
  */
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
@@ -1171,6 +1172,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_AMPDU_EOF_BIT_KNOWN	= BIT(25),
 	RX_FLAG_RADIOTAP_HE		= BIT(26),
 	RX_FLAG_RADIOTAP_HE_MU		= BIT(27),
+	RX_FLAG_RADIOTAP_LSIG		= BIT(28),
 };
 
 /**
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b79bc9055035..4376a4ce8c25 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -189,6 +189,12 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 		BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
 	}
 
+	if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
+		len = ALIGN(len, 2);
+		len += 4;
+		BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_lsig) != 4);
+	}
+
 	if (status->chains) {
 		/* antenna and antenna signal fields */
 		len += 2 * hweight8(status->chains);
@@ -279,6 +285,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	struct ieee80211_vendor_radiotap rtap = {};
 	struct ieee80211_radiotap_he he = {};
 	struct ieee80211_radiotap_he_mu he_mu = {};
+	struct ieee80211_radiotap_lsig lsig = {};
 
 	if (status->flag & RX_FLAG_RADIOTAP_HE) {
 		he = *(struct ieee80211_radiotap_he *)skb->data;
@@ -291,6 +298,11 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		skb_pull(skb, sizeof(he_mu));
 	}
 
+	if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
+		lsig = *(struct ieee80211_radiotap_lsig *)skb->data;
+		skb_pull(skb, sizeof(lsig));
+	}
+
 	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
 		rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
 		/* rtap.len and rtap.pad are undone immediately */
@@ -630,6 +642,15 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		pos += sizeof(he_mu);
 	}
 
+	if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
+		/* ensure 2 byte alignment */
+		while ((pos - (u8 *)rthdr) & 1)
+			pos++;
+		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_LSIG);
+		memcpy(pos, &lsig, sizeof(lsig));
+		pos += sizeof(lsig);
+	}
+
 	for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
 		*pos++ = status->chain_signal[chain];
 		*pos++ = chain;
-- 
cgit v1.2.3


From 2b815b04dfe45d1278fd4137675fe1398f656b0a Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Fri, 31 Aug 2018 15:00:37 +0200
Subject: nl80211: Add CAN_REPLACE_PTK0 API

Drivers able to correctly replace a in-use key should set
@NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 to allow the user space (e.g.
hostapd or wpa_supplicant) to rekey PTK keys.

The user space must detect a PTK rekey attempt and only go ahead with it
when the driver has set this flag. If the driver is not supporting the
feature the user space either must not replace the PTK key or perform a
full re-association instead.

Ignoring this flag and continuing to rekey the connection can still work
but has to be considered insecure and broken. Depending on the driver it
can leak clear text packets or freeze the connection and is only
supported to allow the user space to be updated.

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
Reviewed-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1766a12b231c..cfc94178d608 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5226,6 +5226,11 @@ enum nl80211_feature_flags {
  *	except for supported rates from the probe request content if requested
  *	by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag.
  *
+ * @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0: Driver/device confirm that they are
+ *      able to rekey an in-use key correctly. Userspace must not rekey PTK keys
+ *      if this flag is not set. Ignoring this can leak clear text packets and/or
+ *      freeze the connection.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5263,6 +5268,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_TXQS,
 	NL80211_EXT_FEATURE_SCAN_RANDOM_SN,
 	NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT,
+	NL80211_EXT_FEATURE_CAN_REPLACE_PTK0,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3


From 62872a9b9a106f00360193f428451c321ec2e823 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Fri, 31 Aug 2018 15:00:38 +0200
Subject: mac80211: Fix PTK rekey freezes and clear text leak

Rekeying PTK keys without "Extended Key ID for Individually Addressed
Frames" did use a procedure not suitable to replace in-use keys and
could caused the following issues:

 1) Freeze caused by incoming frames:
    If the local STA installed the key prior to the remote STA we still
    had the old key active in the hardware when mac80211 switched over
    to the new key.
    Therefore there was a window where the card could hand over frames
    decoded with the old key to mac80211 and bump the new PN (IV) value
    to an incorrect high number. When it happened the local replay
    detection silently started to drop all frames sent with the new key.

 2) Freeze caused by outgoing frames:
    If mac80211 was providing the PN (IV) and handed over a clear text
    frame for encryption to the hardware prior to a key change the
    driver/card could have processed the queued frame after switching
    to the new key. This bumped the PN value on the remote STA to an
    incorrect high number, tricking the remote STA to discard all frames
    we sent later.

 3) Freeze caused by RX aggregation reorder buffer:
    An aggregation session started with the old key and ending after the
    switch to the new key also bumped the PN to an incorrect high number,
    freezing the connection quite similar to 1).

 4) Freeze caused by repeating lost frames in an aggregation session:
    A driver could repeat a lost frame and encrypt it with the new key
    while in a TX aggregation session without updating the PN for the
    new key. This also could freeze connections similar to 2).

 5) Clear text leak:
    Removing encryption offload from the card cleared the encryption
    offload flag only after the card had deleted the key and we did not
    stop TX during the rekey. The driver/card could therefore get
    unencrypted frames from mac80211 while no longer be instructed to
    encrypt them.

To prevent those issues the key install logic has been changed:
 - Mac80211 divers known to be able to rekey PTK0 keys have to set
   @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0,
 - mac80211 stops queuing frames depending on the key during the replace
 - the key is first replaced in the hardware and after that in mac80211
 - and mac80211 stops/blocks new aggregation sessions during the rekey.

For drivers not setting
@NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 the user space must avoid PTK
rekeys if "Extended Key ID for Individually Addressed Frames" is not
being used. Rekeys for mac80211 drivers without this flag will generate a
warning and use an extra call to ieee80211_flush_queues() to both
highlight and try to prevent the issues with not updated drivers.

The core of the fix changes the key install procedure from:
 - atomic switch over to the new key in mac80211
 - remove the old key in the hardware (stops encryption offloading, fall
   back to software encryption with a potential clear text packet leak
   in between)
 - delete the inactive old key in mac80211
 - enable hardware encryption offloading for the new key
to:
 - if it's a PTK mark the old key as tainted to drop TX frames with the
   outgoing key
 - replace the key in hardware with the new one
 - atomic switch over to the new (not marked as tainted) key in
   mac80211 (which also resumes TX)
 - delete the inactive old key in mac80211

With the new sequence the hardware will be unable to decrypt frames
encrypted with the old key prior to switching to the new key in mac80211
and thus prevent PNs from packets decrypted with the old key to be
accounted against the new key.

For that to work the drivers have to provide a clear boundary.
Mac80211 drivers setting @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 confirm
to provide it and mac80211 will then be able to correctly rekey in-use
PTK keys with those drivers.

The mac80211 requirements for drivers to set the flag have been added to
the "Hardware crypto acceleration" documentation section. It drills down
to:
The drivers must not hand over frames decrypted with the old key to
mac80211 once the call to set_key() with %DISABLE_KEY has been
completed. It's allowed to either drop or continue to use the old key
for any outgoing frames which are already in the queues, but it must not
send out any of them unencrypted or encrypted with the new key.

Even with the new boundary in place aggregation sessions with the
reorder buffer are problematic:
RX aggregation session started prior and completed after the rekey could
still dump frames received with the old key at mac80211 after it
switched over to the new key. This is side stepped by stopping all (RX
and TX) aggregation sessions when replacing a PTK key and hardware key
offloading.
Stopping TX aggregation sessions avoids the need to get
the PNs (IVs) updated in frames prepared for the old key and
(re)transmitted after the switch to the new key. As a bonus it improves
the compatibility when the remote STA is not handling rekeys as it
should.

When using software crypto aggregation sessions are not stopped.
Mac80211 won't be able to decode the dangerous frames and discard them
without special handling.

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
[trim overly long rekey warning]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  13 ++++++
 net/mac80211/key.c     | 110 ++++++++++++++++++++++++++++++++++++++++---------
 net/mac80211/tx.c      |   4 ++
 3 files changed, 107 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 3cc1ca17a1a8..8c26d2d36cbe 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2531,6 +2531,19 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
  * The set_default_unicast_key() call updates the default WEP key index
  * configured to the hardware for WEP encryption type. This is required
  * for devices that support offload of data packets (e.g. ARP responses).
+ *
+ * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag
+ * when they are able to replace in-use PTK keys according to to following
+ * requirements:
+ * 1) They do not hand over frames decrypted with the old key to
+      mac80211 once the call to set_key() with command %DISABLE_KEY has been
+      completed when also setting @IEEE80211_KEY_FLAG_GENERATE_IV for any key,
+   2) either drop or continue to use the old key for any outgoing frames queued
+      at the time of the key deletion (including re-transmits),
+   3) never send out a frame queued prior to the set_key() %SET_KEY command
+      encrypted with the new key and
+   4) never send out a frame unencrypted when it should be encrypted.
+   Mac80211 will not queue any new frames for a deleted key to the driver.
  */
 
 /**
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index c054ac85793c..d6eeace7b83a 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -248,6 +248,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
 	      (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
 		increment_tailroom_need_count(sdata);
 
+	key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
 	ret = drv_set_key(key->local, DISABLE_KEY, sdata,
 			  sta ? &sta->sta : NULL, &key->conf);
 
@@ -256,8 +257,65 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
 			  "failed to remove key (%d, %pM) from hardware (%d)\n",
 			  key->conf.keyidx,
 			  sta ? sta->sta.addr : bcast_addr, ret);
+}
 
-	key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
+				    struct ieee80211_key *new_key,
+				    bool ptk0rekey)
+{
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_local *local;
+	struct sta_info *sta;
+	int ret;
+
+	/* Aggregation sessions are OK when running on SW crypto.
+	 * A broken remote STA may cause issues not observed with HW
+	 * crypto, though.
+	 */
+	if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+		return 0;
+
+	assert_key_lock(old_key->local);
+	sta = old_key->sta;
+
+	/* PTK only using key ID 0 needs special handling on rekey */
+	if (new_key && sta && ptk0rekey) {
+		local = old_key->local;
+		sdata = old_key->sdata;
+
+		/* Stop TX till we are on the new key */
+		old_key->flags |= KEY_FLAG_TAINTED;
+		ieee80211_clear_fast_xmit(sta);
+
+		/* Aggregation sessions during rekey are complicated due to the
+		 * reorder buffer and retransmits. Side step that by blocking
+		 * aggregation during rekey and tear down running sessions.
+		 */
+		if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
+			set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+			ieee80211_sta_tear_down_BA_sessions(sta,
+							    AGG_STOP_LOCAL_REQUEST);
+		}
+
+		if (!wiphy_ext_feature_isset(local->hw.wiphy,
+					     NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) {
+			pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.",
+					    sta->sta.addr);
+			/* Flushing the driver queues *may* help prevent
+			 * the clear text leaks and freezes.
+			 */
+			ieee80211_flush_queues(local, sdata, false);
+		}
+	}
+
+	ieee80211_key_disable_hw_accel(old_key);
+
+	if (new_key)
+		ret = ieee80211_key_enable_hw_accel(new_key);
+	else
+		ret = 0;
+
+	return ret;
 }
 
 static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
@@ -316,38 +374,56 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
 }
 
 
-static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
+static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 				  struct sta_info *sta,
 				  bool pairwise,
 				  struct ieee80211_key *old,
 				  struct ieee80211_key *new)
 {
 	int idx;
+	int ret;
 	bool defunikey, defmultikey, defmgmtkey;
 
 	/* caller must provide at least one old/new */
 	if (WARN_ON(!new && !old))
-		return;
+		return 0;
 
 	if (new)
 		list_add_tail_rcu(&new->list, &sdata->key_list);
 
 	WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
 
-	if (old)
+	if (old) {
 		idx = old->conf.keyidx;
-	else
+		/* TODO: proper implement and test "Extended Key ID for
+		 * Individually Addressed Frames" from IEEE 802.11-2016.
+		 * Till then always assume only key ID 0 is used for
+		 * pairwise keys.*/
+		ret = ieee80211_hw_key_replace(old, new, pairwise);
+	} else {
 		idx = new->conf.keyidx;
+		if (new && !new->local->wowlan)
+			ret = ieee80211_key_enable_hw_accel(new);
+		else
+			ret = 0;
+	}
+
+	if (ret)
+		return ret;
 
 	if (sta) {
 		if (pairwise) {
 			rcu_assign_pointer(sta->ptk[idx], new);
 			sta->ptk_idx = idx;
-			ieee80211_check_fast_xmit(sta);
+			if (new) {
+				clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
+				ieee80211_check_fast_xmit(sta);
+			}
 		} else {
 			rcu_assign_pointer(sta->gtk[idx], new);
 		}
-		ieee80211_check_fast_rx(sta);
+		if (new)
+			ieee80211_check_fast_rx(sta);
 	} else {
 		defunikey = old &&
 			old == key_mtx_dereference(sdata->local,
@@ -380,6 +456,8 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 
 	if (old)
 		list_del_rcu(&old->list);
+
+	return 0;
 }
 
 struct ieee80211_key *
@@ -575,9 +653,6 @@ static void ieee80211_key_free_common(struct ieee80211_key *key)
 static void __ieee80211_key_destroy(struct ieee80211_key *key,
 				    bool delay_tailroom)
 {
-	if (key->local)
-		ieee80211_key_disable_hw_accel(key);
-
 	if (key->local) {
 		struct ieee80211_sub_if_data *sdata = key->sdata;
 
@@ -654,7 +729,6 @@ int ieee80211_key_link(struct ieee80211_key *key,
 		       struct ieee80211_sub_if_data *sdata,
 		       struct sta_info *sta)
 {
-	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_key *old_key;
 	int idx = key->conf.keyidx;
 	bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
@@ -691,17 +765,13 @@ int ieee80211_key_link(struct ieee80211_key *key,
 
 	increment_tailroom_need_count(sdata);
 
-	ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
-	ieee80211_key_destroy(old_key, delay_tailroom);
-
-	ieee80211_debugfs_key_add(key);
+	ret = ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
 
-	if (!local->wowlan) {
-		ret = ieee80211_key_enable_hw_accel(key);
-		if (ret)
-			ieee80211_key_free(key, delay_tailroom);
+	if (!ret) {
+		ieee80211_debugfs_key_add(key);
+		ieee80211_key_destroy(old_key, delay_tailroom);
 	} else {
-		ret = 0;
+		ieee80211_key_free(key, delay_tailroom);
 	}
 
  out:
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 7d34222337c0..e88547842239 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2968,6 +2968,10 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
 		if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
 			goto out;
 
+		/* Key is being removed */
+		if (build.key->flags & KEY_FLAG_TAINTED)
+			goto out;
+
 		switch (build.key->conf.cipher) {
 		case WLAN_CIPHER_SUITE_CCMP:
 		case WLAN_CIPHER_SUITE_CCMP_256:
-- 
cgit v1.2.3


From c3d1f8752802b2e1fb12c73bee50035bc125bc54 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Wed, 5 Sep 2018 08:06:06 +0300
Subject: mac80211: support reporting 0-length PSDU in radiotap

For certain sounding frames, it may be useful to report them
to userspace even though they don't have a PSDU in order to
determine the PHY parameters (e.g. VHT rate/stream config.)
Add support for this to mac80211.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h |  6 ++++++
 include/net/mac80211.h           |  7 +++++++
 net/mac80211/rx.c                | 12 +++++++++++-
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 80d543902b8b..8014153bdd49 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -75,6 +75,7 @@ enum ieee80211_radiotap_presence {
 	IEEE80211_RADIOTAP_TIMESTAMP = 22,
 	IEEE80211_RADIOTAP_HE = 23,
 	IEEE80211_RADIOTAP_HE_MU = 24,
+	IEEE80211_RADIOTAP_ZERO_LEN_PSDU = 26,
 	IEEE80211_RADIOTAP_LSIG = 27,
 
 	/* valid in every it_present bitmap, even vendor namespaces */
@@ -340,6 +341,11 @@ struct ieee80211_radiotap_lsig {
 	__le16 data1, data2;
 };
 
+enum ieee80211_radiotap_zero_len_psdu_type {
+	IEEE80211_RADIOTAP_ZERO_LEN_PSDU_SOUNDING		= 0,
+	IEEE80211_RADIOTAP_ZERO_LEN_PSDU_VENDOR			= 0xff,
+};
+
 /**
  * ieee80211_get_radiotap_len - get radiotap header length
  */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8c26d2d36cbe..50bf598abdfd 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1142,6 +1142,10 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present
  *	(&struct ieee80211_radiotap_he_mu)
  * @RX_FLAG_RADIOTAP_LSIG: L-SIG radiotap data is present
+ * @RX_FLAG_NO_PSDU: use the frame only for radiotap reporting, with
+ *	the "0-length PSDU" field included there.  The value for it is
+ *	in &struct ieee80211_rx_status.  Note that if this value isn't
+ *	known the frame shouldn't be reported.
  */
 enum mac80211_rx_flags {
 	RX_FLAG_MMIC_ERROR		= BIT(0),
@@ -1173,6 +1177,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_RADIOTAP_HE		= BIT(26),
 	RX_FLAG_RADIOTAP_HE_MU		= BIT(27),
 	RX_FLAG_RADIOTAP_LSIG		= BIT(28),
+	RX_FLAG_NO_PSDU			= BIT(29),
 };
 
 /**
@@ -1245,6 +1250,7 @@ enum mac80211_rx_encoding {
  * @ampdu_reference: A-MPDU reference number, must be a different value for
  *	each A-MPDU but the same for each subframe within one A-MPDU
  * @ampdu_delimiter_crc: A-MPDU delimiter CRC
+ * @zero_length_psdu_type: radiotap type of the 0-length PSDU
  */
 struct ieee80211_rx_status {
 	u64 mactime;
@@ -1265,6 +1271,7 @@ struct ieee80211_rx_status {
 	u8 chains;
 	s8 chain_signal[IEEE80211_MAX_CHAINS];
 	u8 ampdu_delimiter_crc;
+	u8 zero_length_psdu_type;
 };
 
 /**
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 4376a4ce8c25..c6bfd4019d44 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -115,7 +115,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
 
 	if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
 			    RX_FLAG_FAILED_PLCP_CRC |
-			    RX_FLAG_ONLY_MONITOR))
+			    RX_FLAG_ONLY_MONITOR |
+			    RX_FLAG_NO_PSDU))
 		return true;
 
 	if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
@@ -189,6 +190,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 		BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
 	}
 
+	if (status->flag & RX_FLAG_NO_PSDU)
+		len += 1;
+
 	if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
 		len = ALIGN(len, 2);
 		len += 4;
@@ -642,6 +646,12 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		pos += sizeof(he_mu);
 	}
 
+	if (status->flag & RX_FLAG_NO_PSDU) {
+		rthdr->it_present |=
+			cpu_to_le32(1 << IEEE80211_RADIOTAP_ZERO_LEN_PSDU);
+		*pos++ = status->zero_length_psdu_type;
+	}
+
 	if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
 		/* ensure 2 byte alignment */
 		while ((pos - (u8 *)rthdr) & 1)
-- 
cgit v1.2.3


From add7453ad62f05c8f1a48675bb4dfed52e6ac878 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Wed, 5 Sep 2018 08:06:08 +0300
Subject: wireless: align to draft 11ax D3.0

Align to new 11ax draft D3.0.  Change/add new MAC and PHY capabilities
and update drivers' 11ax capabilities and mac80211's debugfs
accordingly.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 51 ++++++++++----
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  4 --
 drivers/net/wireless/mac80211_hwsim.c              | 18 ++---
 include/linux/ieee80211.h                          | 72 ++++++++++++--------
 net/mac80211/debugfs_sta.c                         | 77 +++++++++++++++-------
 5 files changed, 146 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 73969dbeb5c5..27db4a3ba1f8 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -476,30 +476,40 @@ static struct ieee80211_sband_iftype_data iwl_he_capa = {
 		.has_he = true,
 		.he_cap_elem = {
 			.mac_cap_info[0] =
-				IEEE80211_HE_MAC_CAP0_HTC_HE,
+				IEEE80211_HE_MAC_CAP0_HTC_HE |
+				IEEE80211_HE_MAC_CAP0_TWT_REQ,
 			.mac_cap_info[1] =
 				IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
-				IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8,
+				IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
 			.mac_cap_info[2] =
 				IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP |
+				IEEE80211_HE_MAC_CAP2_MU_CASCADING |
 				IEEE80211_HE_MAC_CAP2_ACK_EN,
 			.mac_cap_info[3] =
-				IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU |
-				IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2,
-			.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
+				IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
+				IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+			.mac_cap_info[4] =
+				IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU |
+				IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39,
+			.mac_cap_info[5] =
+				IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 |
+				IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41 |
+				IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU,
 			.phy_cap_info[0] =
-				IEEE80211_HE_PHY_CAP0_DUAL_BAND |
 				IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G |
 				IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
 				IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G,
 			.phy_cap_info[1] =
+				IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
 				IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
 				IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
-				IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS,
+				IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
 			.phy_cap_info[2] =
 				IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
 				IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
-				IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ,
+				IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
+				IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
+				IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,
 			.phy_cap_info[3] =
 				IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK |
 				IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 |
@@ -511,18 +521,31 @@ static struct ieee80211_sband_iftype_data iwl_he_capa = {
 				IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8,
 			.phy_cap_info[5] =
 				IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 |
-				IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2,
+				IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 |
+				IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK |
+				IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK,
 			.phy_cap_info[6] =
+				IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU |
+				IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU |
+				IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB |
+				IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB |
+				IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB |
+				IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO |
 				IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT,
 			.phy_cap_info[7] =
 				IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR |
 				IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI |
-				IEEE80211_HE_PHY_CAP7_MAX_NC_7,
+				IEEE80211_HE_PHY_CAP7_MAX_NC_1,
 			.phy_cap_info[8] =
 				IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI |
 				IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G |
 				IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU |
-				IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU,
+				IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU |
+				IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_160_OR_80P80_MHZ,
+			.phy_cap_info[9] =
+				IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK |
+				IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB |
+				IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB,
 		},
 		/*
 		 * Set default Tx/Rx HE MCS NSS Support field. Indicate support
@@ -559,9 +582,11 @@ static void iwl_init_he_hw_capab(struct ieee80211_supported_band *sband,
 	/* If not 2x2, we need to indicate 1x1 in the Midamble RX Max NSTS */
 	if ((tx_chains & rx_chains) != ANT_AB) {
 		iwl_he_capa.he_cap.he_cap_elem.phy_cap_info[1] &=
-			~IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS;
+			~IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS;
 		iwl_he_capa.he_cap.he_cap_elem.phy_cap_info[2] &=
-			~IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS;
+			~IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS;
+		iwl_he_capa.he_cap.he_cap_elem.phy_cap_info[7] &=
+			~IEEE80211_HE_PHY_CAP7_MAX_NC_MASK;
 	}
 }
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index b15b0d84bb7e..d46f3fbea46e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1978,10 +1978,6 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm,
 			sta_ctxt_cmd.htc_flags |=
 				cpu_to_le32(IWL_HE_HTC_LINK_ADAP_BOTH);
 	}
-	if (sta->he_cap.he_cap_elem.mac_cap_info[2] &
-	    IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED)
-		sta_ctxt_cmd.htc_flags |=
-			cpu_to_le32(IWL_HE_HTC_UL_MU_RESP_SCHED);
 	if (sta->he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BSR)
 		sta_ctxt_cmd.htc_flags |= cpu_to_le32(IWL_HE_HTC_BSR_SUPP);
 	if (sta->he_cap.he_cap_elem.mac_cap_info[3] &
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 1068757ec42e..f3863101af78 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2008, Jouni Malinen <j@w1.fi>
  * Copyright (c) 2011, Javier Lopez <jlopex@gmail.com>
  * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -2529,23 +2530,20 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz = {
 				IEEE80211_HE_MAC_CAP0_HTC_HE,
 			.mac_cap_info[1] =
 				IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
-				IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8,
+				IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
 			.mac_cap_info[2] =
 				IEEE80211_HE_MAC_CAP2_BSR |
 				IEEE80211_HE_MAC_CAP2_MU_CASCADING |
 				IEEE80211_HE_MAC_CAP2_ACK_EN,
 			.mac_cap_info[3] =
-				IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU |
 				IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-				IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2,
+				IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
 			.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
-			.phy_cap_info[0] =
-				IEEE80211_HE_PHY_CAP0_DUAL_BAND,
 			.phy_cap_info[1] =
 				IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
 				IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
 				IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
-				IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS,
+				IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
 			.phy_cap_info[2] =
 				IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
 				IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
@@ -2579,18 +2577,16 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz = {
 				IEEE80211_HE_MAC_CAP0_HTC_HE,
 			.mac_cap_info[1] =
 				IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
-				IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8,
+				IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
 			.mac_cap_info[2] =
 				IEEE80211_HE_MAC_CAP2_BSR |
 				IEEE80211_HE_MAC_CAP2_MU_CASCADING |
 				IEEE80211_HE_MAC_CAP2_ACK_EN,
 			.mac_cap_info[3] =
-				IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU |
 				IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-				IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2,
+				IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
 			.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
 			.phy_cap_info[0] =
-				IEEE80211_HE_PHY_CAP0_DUAL_BAND |
 				IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
 				IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
 				IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
@@ -2598,7 +2594,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz = {
 				IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
 				IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
 				IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
-				IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS,
+				IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
 			.phy_cap_info[2] =
 				IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
 				IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 280600a10111..c4809ad8ab46 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1555,11 +1555,11 @@ struct ieee80211_vht_operation {
  * struct ieee80211_he_cap_elem - HE capabilities element
  *
  * This structure is the "HE capabilities element" fixed fields as
- * described in P802.11ax_D2.0 section 9.4.2.237.2 and 9.4.2.237.3
+ * described in P802.11ax_D3.0 section 9.4.2.237.2 and 9.4.2.237.3
  */
 struct ieee80211_he_cap_elem {
-	u8 mac_cap_info[5];
-	u8 phy_cap_info[9];
+	u8 mac_cap_info[6];
+	u8 phy_cap_info[11];
 } __packed;
 
 #define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN	5
@@ -1738,15 +1738,15 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US		0x04
 #define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US		0x08
 #define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK		0x0c
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_1		0x00
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_2		0x10
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_3		0x20
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_4		0x30
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_5		0x40
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_6		0x50
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_7		0x60
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8		0x70
-#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_MASK		0x70
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1		0x00
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2		0x10
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3		0x20
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4		0x30
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5		0x40
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6		0x50
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7		0x60
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8		0x70
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK		0x70
 
 /* Link adaptation is split between byte HE_MAC_CAP1 and
  * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE
@@ -1760,14 +1760,13 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 
 #define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION			0x01
 #define IEEE80211_HE_MAC_CAP2_ALL_ACK				0x02
-#define IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED			0x04
+#define IEEE80211_HE_MAC_CAP2_TRS				0x04
 #define IEEE80211_HE_MAC_CAP2_BSR				0x08
 #define IEEE80211_HE_MAC_CAP2_BCAST_TWT				0x10
 #define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP			0x20
 #define IEEE80211_HE_MAC_CAP2_MU_CASCADING			0x40
 #define IEEE80211_HE_MAC_CAP2_ACK_EN				0x80
 
-#define IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU	0x01
 #define IEEE80211_HE_MAC_CAP3_OMI_CONTROL			0x02
 #define IEEE80211_HE_MAC_CAP3_OFDMA_RA				0x04
 
@@ -1775,25 +1774,34 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
  * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the
  * same field in the HE capabilities.
  */
-#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT	0x00
-#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1		0x08
-#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2		0x10
-#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED	0x18
-#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK		0x18
-#define IEEE80211_HE_MAC_CAP3_A_AMSDU_FRAG			0x20
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT	0x00
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1		0x08
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2		0x10
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED	0x18
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK		0x18
+#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG			0x20
 #define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED			0x40
 #define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS		0x80
 
 #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
 #define IEEE80211_HE_MAC_CAP4_QTP				0x02
 #define IEEE80211_HE_MAC_CAP4_BQR				0x04
-#define IEEE80211_HE_MAC_CAP4_SR_RESP				0x08
+#define IEEE80211_HE_MAC_CAP4_SRP_RESP				0x08
 #define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
 #define IEEE80211_HE_MAC_CAP4_OPS				0x20
 #define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU			0x40
+/* Multi TID agg TX is split between byte #4 and #5
+ * The value is a combination of B39,B40,B41
+ */
+#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39		0x80
+
+#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40		0x01
+#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41		0x02
+#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECVITE_TRANSMISSION	0x04
+#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU			0x08
+#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX		0x10
 
 /* 802.11ax HE PHY capabilities */
-#define IEEE80211_HE_PHY_CAP0_DUAL_BAND					0x01
 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G		0x02
 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G	0x04
 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G		0x08
@@ -1810,10 +1818,10 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A				0x10
 #define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD			0x20
 #define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US		0x40
-/* Midamble RX Max NSTS is split between byte #2 and byte #3 */
-#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS			0x80
+/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */
+#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS			0x80
 
-#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS			0x01
+#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS			0x01
 #define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US			0x02
 #define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ			0x04
 #define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ			0x08
@@ -1914,7 +1922,19 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU			0x04
 #define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU			0x08
 #define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI		0x10
-#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_2X_AND_1XLTF			0x20
+#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF		0x20
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_20MHZ				0x00
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_40MHZ				0x40
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_80MHZ				0x80
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_160_OR_80P80_MHZ		0xc0
+#define IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_MASK				0xc0
+
+#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM		0x01
+#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK		0x02
+#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU		0x04
+#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU		0x08
+#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB	0x10
+#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB	0x20
 
 /* 802.11ax HE TX/RX MCS NSS Support  */
 #define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS			(3)
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 95124978947f..af5185a836e5 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -522,8 +522,8 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 
 	cap = hec->he_cap_elem.mac_cap_info;
 	p += scnprintf(p, buf_sz + buf - p,
-		       "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x\n",
-		       cap[0], cap[1], cap[2], cap[3], cap[4]);
+		       "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
+		       cap[0], cap[1], cap[2], cap[3], cap[4], cap[5]);
 
 #define PRINT(fmt, ...)							\
 	p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n",		\
@@ -563,7 +563,8 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 			    "MIN-FRAG-SIZE-%d", UNLIMITED, "UNLIMITED");
 	PFLAG_RANGE_DEFAULT(MAC, 1, TF_MAC_PAD_DUR, 0, 8, 0,
 			    "TF-MAC-PAD-DUR-%dUS", MASK, "UNKNOWN");
-	PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_QOS, 0, 1, 1, "MULTI-TID-AGG-QOS-%d");
+	PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_RX_QOS, 0, 1, 1,
+		    "MULTI-TID-AGG-RX-QOS-%d");
 
 	if (cap[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) {
 		switch (((cap[2] << 1) | (cap[1] >> 7)) & 0x3) {
@@ -583,52 +584,55 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 	}
 
 	PFLAG(MAC, 2, ALL_ACK, "ALL-ACK");
-	PFLAG(MAC, 2, UL_MU_RESP_SCHED, "UL-MU-RESP-SCHED");
+	PFLAG(MAC, 2, TRS, "TRS");
 	PFLAG(MAC, 2, BSR, "BSR");
 	PFLAG(MAC, 2, BCAST_TWT, "BCAST-TWT");
 	PFLAG(MAC, 2, 32BIT_BA_BITMAP, "32BIT-BA-BITMAP");
 	PFLAG(MAC, 2, MU_CASCADING, "MU-CASCADING");
 	PFLAG(MAC, 2, ACK_EN, "ACK-EN");
 
-	PFLAG(MAC, 3, GRP_ADDR_MULTI_STA_BA_DL_MU,
-	      "GRP-ADDR-MULTI-STA-BA-DL-MU");
 	PFLAG(MAC, 3, OMI_CONTROL, "OMI-CONTROL");
 	PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA");
 
-	switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK) {
-	case IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT:
-		PRINT("MAX-A-AMPDU-LEN-EXP-USE-VHT");
+	switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) {
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT:
+		PRINT("MAX-AMPDU-LEN-EXP-USE-VHT");
 		break;
-	case IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1:
-		PRINT("MAX-A-AMPDU-LEN-EXP-VHT-1");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1:
+		PRINT("MAX-AMPDU-LEN-EXP-VHT-1");
 		break;
-	case IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2:
-		PRINT("MAX-A-AMPDU-LEN-EXP-VHT-2");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2:
+		PRINT("MAX-AMPDU-LEN-EXP-VHT-2");
 		break;
-	case IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED:
-		PRINT("MAX-A-AMPDU-LEN-EXP-RESERVED");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED:
+		PRINT("MAX-AMPDU-LEN-EXP-RESERVED");
 		break;
 	}
 
-	PFLAG(MAC, 3, A_AMSDU_FRAG, "A-AMSDU-FRAG");
+	PFLAG(MAC, 3, AMSDU_FRAG, "AMSDU-FRAG");
 	PFLAG(MAC, 3, FLEX_TWT_SCHED, "FLEX-TWT-SCHED");
 	PFLAG(MAC, 3, RX_CTRL_FRAME_TO_MULTIBSS, "RX-CTRL-FRAME-TO-MULTIBSS");
 
 	PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG");
 	PFLAG(MAC, 4, QTP, "QTP");
 	PFLAG(MAC, 4, BQR, "BQR");
-	PFLAG(MAC, 4, SR_RESP, "SR-RESP");
+	PFLAG(MAC, 4, SRP_RESP, "SRP-RESP");
 	PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP");
 	PFLAG(MAC, 4, OPS, "OPS");
 	PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU");
 
+	PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7);
+
+	PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION,
+	      "SUBCHAN-SELECVITE-TRANSMISSION");
+	PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU");
+	PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX");
+
 	cap = hec->he_cap_elem.phy_cap_info;
 	p += scnprintf(p, buf_sz + buf - p,
-		       "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
+		       "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
 		       cap[0], cap[1], cap[2], cap[3], cap[4], cap[5], cap[6],
-		       cap[7], cap[8]);
-
-	PFLAG(PHY, 0, DUAL_BAND, "DUAL-BAND");
+		       cap[7], cap[8], cap[9], cap[10]);
 
 	PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_IN_2G,
 	      "CHANNEL-WIDTH-SET-40MHZ-IN-2G");
@@ -754,7 +758,36 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 	PFLAG(PHY, 8, 80MHZ_IN_160MHZ_HE_PPDU, "80MHZ-IN-160MHZ-HE-PPDU");
 	PFLAG(PHY, 8, HE_ER_SU_1XLTF_AND_08_US_GI,
 	      "HE-ER-SU-1XLTF-AND-08-US-GI");
-	PFLAG(PHY, 8, MIDAMBLE_RX_2X_AND_1XLTF, "MIDAMBLE-RX-2X-AND-1XLTF");
+	PFLAG(PHY, 8, MIDAMBLE_RX_TX_2X_AND_1XLTF,
+	      "MIDAMBLE-RX-TX-2X-AND-1XLTF");
+
+	switch (cap[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_MASK) {
+	case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_20MHZ:
+		PRINT("DDCM-MAX-BW-20MHZ");
+		break;
+	case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_40MHZ:
+		PRINT("DCM-MAX-BW-40MHZ");
+		break;
+	case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_80MHZ:
+		PRINT("DCM-MAX-BW-80MHZ");
+		break;
+	case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_160_OR_80P80_MHZ:
+		PRINT("DCM-MAX-BW-160-OR-80P80-MHZ");
+		break;
+	}
+
+	PFLAG(PHY, 9, LONGER_THAN_16_SIGB_OFDM_SYM,
+	      "LONGER-THAN-16-SIGB-OFDM-SYM");
+	PFLAG(PHY, 9, NON_TRIGGERED_CQI_FEEDBACK,
+	      "NON-TRIGGERED-CQI-FEEDBACK");
+	PFLAG(PHY, 9, TX_1024_QAM_LESS_THAN_242_TONE_RU,
+	      "TX-1024-QAM-LESS-THAN-242-TONE-RU");
+	PFLAG(PHY, 9, RX_1024_QAM_LESS_THAN_242_TONE_RU,
+	      "RX-1024-QAM-LESS-THAN-242-TONE-RU");
+	PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB,
+	      "RX-FULL-BW-SU-USING-MU-WITH-COMP-SIGB");
+	PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB,
+	      "RX-FULL-BW-SU-USING-MU-WITH-NON-COMP-SIGB");
 
 #undef PFLAG_RANGE_DEFAULT
 #undef PFLAG_RANGE
-- 
cgit v1.2.3


From 0eeb2b674f05ccb5162a1d68c0b8ae81e25fd972 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Wed, 5 Sep 2018 08:06:09 +0300
Subject: mac80211: add an option for station management TXQ

We have a TXQ abstraction for non-data packets that need
powersave buffering. Since the AP cannot sleep, in case
of station we can use this TXQ for all management frames,
regardless if they are bufferable. Add HW flag to allow
that.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  5 +++++
 net/mac80211/debugfs.c |  1 +
 net/mac80211/tx.c      | 14 +++++++++++---
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 50bf598abdfd..fe71cec8ba42 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2152,6 +2152,10 @@ struct ieee80211_txq {
  *	but if the rate control is built-in then it must be set by the driver.
  *	See also the documentation for that flag.
  *
+ * @IEEE80211_HW_STA_MMPDU_TXQ: use the extra non-TID per-station TXQ for all
+ *	MMPDUs on station interfaces. This of course requires the driver to use
+ *	TXQs to start with.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2199,6 +2203,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP,
 	IEEE80211_HW_BUFF_MMPDU_TXQ,
 	IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW,
+	IEEE80211_HW_STA_MMPDU_TXQ,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index fb45eb5d1dc4..3fe541e358f3 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -217,6 +217,7 @@ static const char *hw_flag_names[] = {
 	FLAG(DOESNT_SUPPORT_QOS_NDP),
 	FLAG(BUFF_MMPDU_TXQ),
 	FLAG(SUPPORTS_VHT_EXT_NSS_BW),
+	FLAG(STA_MMPDU_TXQ),
 #undef FLAG
 };
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e88547842239..96b6c9b09bd2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1251,7 +1251,8 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
 
 	if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
 		if ((!ieee80211_is_mgmt(hdr->frame_control) ||
-		     ieee80211_is_bufferable_mmpdu(hdr->frame_control)) &&
+		     ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
+		     vif->type == NL80211_IFTYPE_STATION) &&
 		    sta && sta->uploaded) {
 			/*
 			 * This will be NULL if the driver didn't set the
@@ -1456,9 +1457,16 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (tid == IEEE80211_NUM_TIDS) {
-		/* Drivers need to opt in to the bufferable MMPDU TXQ */
-		if (!ieee80211_hw_check(&sdata->local->hw, BUFF_MMPDU_TXQ))
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+			/* Drivers need to opt in to the management MPDU TXQ */
+			if (!ieee80211_hw_check(&sdata->local->hw,
+						STA_MMPDU_TXQ))
+				return;
+		} else if (!ieee80211_hw_check(&sdata->local->hw,
+					       BUFF_MMPDU_TXQ)) {
+			/* Drivers need to opt in to the bufferable MMPDU TXQ */
 			return;
+		}
 		txqi->txq.ac = IEEE80211_AC_VO;
 	} else {
 		txqi->txq.ac = ieee80211_ac_from_tid(tid);
-- 
cgit v1.2.3


From edba6bdad6fef787c0363e8a1e7d91e8d6a10129 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Wed, 5 Sep 2018 08:06:10 +0300
Subject: mac80211: allow AMSDU size limitation per-TID

Some drivers may have AMSDU size limitation per TID, due to
HW constrains. Add an option to set this limit.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 net/mac80211/tx.c      | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fe71cec8ba42..28da9e27ea70 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1853,6 +1853,7 @@ struct ieee80211_sta_rates {
  *	unlimited.
  * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
  * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
+ * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that
  *	the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames
  */
@@ -1894,6 +1895,7 @@ struct ieee80211_sta {
 	u16 max_amsdu_len;
 	bool support_p2p_ps;
 	u16 max_rc_amsdu_len;
+	u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 96b6c9b09bd2..42f44c33a133 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3225,6 +3225,10 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
 		max_amsdu_len = min_t(int, max_amsdu_len,
 				      sta->sta.max_rc_amsdu_len);
 
+	if (sta->sta.max_tid_amsdu_len[tid])
+		max_amsdu_len = min_t(int, max_amsdu_len,
+				      sta->sta.max_tid_amsdu_len[tid]);
+
 	spin_lock_bh(&fq->lock);
 
 	/* TODO: Ideally aggregation should be done on dequeue to remain
-- 
cgit v1.2.3


From 9739fe29a207ffff55361a3047e7780ebddccdb2 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Wed, 5 Sep 2018 08:06:11 +0300
Subject: mac80211: add an option for drivers to check if packets can be
 aggregated

Some hardwares have limitations on the packets' type in AMSDU.
Add an optional driver callback to determine if two skbs can
be used in the same AMSDU or not.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    |  7 +++++++
 net/mac80211/driver-ops.h | 10 ++++++++++
 net/mac80211/tx.c         |  3 +++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 28da9e27ea70..c4fadbafbf21 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3594,6 +3594,10 @@ enum ieee80211_reconfig_type {
  * @del_nan_func: Remove a NAN function. The driver must call
  *	ieee80211_nan_func_terminated() with
  *	NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal.
+ * @can_aggregate_in_amsdu: Called in order to determine if HW supports
+ *	aggregating two specific frames in the same A-MSDU. The relation
+ *	between the skbs should be symmetric and transitive. Note that while
+ *	skb is always a real frame, head may or may not be an A-MSDU.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -3876,6 +3880,9 @@ struct ieee80211_ops {
 	void (*del_nan_func)(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif,
 			    u8 instance_id);
+	bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw,
+				       struct sk_buff *head,
+				       struct sk_buff *skb);
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 8f6998091d26..e42c641b6190 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1173,6 +1173,16 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local,
 	local->ops->wake_tx_queue(&local->hw, &txq->txq);
 }
 
+static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local,
+					     struct sk_buff *head,
+					     struct sk_buff *skb)
+{
+	if (!local->ops->can_aggregate_in_amsdu)
+		return true;
+
+	return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb);
+}
+
 static inline int drv_start_nan(struct ieee80211_local *local,
 				struct ieee80211_sub_if_data *sdata,
 				struct cfg80211_nan_conf *conf)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 42f44c33a133..c42bfa1dcd2c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3261,6 +3261,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
 	if (max_frags && nfrags > max_frags)
 		goto out;
 
+	if (!drv_can_aggregate_in_amsdu(local, head, skb))
+		goto out;
+
 	if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
 		goto out;
 
-- 
cgit v1.2.3


From 83033688b7ade18d2dbbcefa810f02ff66ba549d Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 23 Jul 2018 10:55:39 +0300
Subject: net/mlx5: Change flow counters addlist type to single linked list

In order to prevent flow counters stats work function from traversing whole
flow counters tree while searching for deleted flow counters, new list to
store deleted flow counters will be added to struct mlx5_fc_stats. However,
the flow counter structure itself has no space left to store any more data
in first cache line. To free space that is needed to store additional list
node, convert current addlist double linked list (two pointers per node) to
atomic single linked list (one pointer per node).

Lockless NULL-terminated single linked list data type doesn't require any
additional external synchronization for operations used by flow counters
module (add single new element, remove all elements from list and traverse
them). Remove addlist_lock that is no longer needed.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  3 +-
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 45 ++++++++++------------
 include/linux/mlx5/driver.h                        |  4 +-
 3 files changed, 23 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 32070e5d993d..f68590291e0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -36,6 +36,7 @@
 #include <linux/refcount.h>
 #include <linux/mlx5/fs.h>
 #include <linux/rhashtable.h>
+#include <linux/llist.h>
 
 enum fs_node_type {
 	FS_TYPE_NAMESPACE,
@@ -139,7 +140,7 @@ struct mlx5_fc_cache {
 
 struct mlx5_fc {
 	struct rb_node node;
-	struct list_head list;
+	struct llist_node addlist;
 
 	/* last{packets,bytes} members are used when calculating the delta since
 	 * last reading
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 58af6be13dfa..d996d6cf9e19 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -52,7 +52,9 @@
  * access to counter list:
  * - create (user context)
  *   - mlx5_fc_create() only adds to an addlist to be used by
- *     mlx5_fc_stats_query_work(). addlist is protected by a spinlock.
+ *     mlx5_fc_stats_query_work(). addlist is a lockless single linked list
+ *     that doesn't require any additional synchronization when adding single
+ *     node.
  *   - spawn thread to do the actual destroy
  *
  * - destroy (user context)
@@ -156,28 +158,29 @@ out:
 	return node;
 }
 
+static void mlx5_free_fc(struct mlx5_core_dev *dev,
+			 struct mlx5_fc *counter)
+{
+	mlx5_cmd_fc_free(dev, counter->id);
+	kfree(counter);
+}
+
 static void mlx5_fc_stats_work(struct work_struct *work)
 {
 	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
 						 priv.fc_stats.work.work);
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct llist_node *tmplist = llist_del_all(&fc_stats->addlist);
 	unsigned long now = jiffies;
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_fc *last = NULL;
 	struct rb_node *node;
-	LIST_HEAD(tmplist);
-
-	spin_lock(&fc_stats->addlist_lock);
 
-	list_splice_tail_init(&fc_stats->addlist, &tmplist);
-
-	if (!list_empty(&tmplist) || !RB_EMPTY_ROOT(&fc_stats->counters))
+	if (tmplist || !RB_EMPTY_ROOT(&fc_stats->counters))
 		queue_delayed_work(fc_stats->wq, &fc_stats->work,
 				   fc_stats->sampling_interval);
 
-	spin_unlock(&fc_stats->addlist_lock);
-
-	list_for_each_entry(counter, &tmplist, list)
+	llist_for_each_entry(counter, tmplist, addlist)
 		mlx5_fc_stats_insert(&fc_stats->counters, counter);
 
 	node = rb_first(&fc_stats->counters);
@@ -229,9 +232,7 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 		counter->cache.lastuse = jiffies;
 		counter->aging = true;
 
-		spin_lock(&fc_stats->addlist_lock);
-		list_add(&counter->list, &fc_stats->addlist);
-		spin_unlock(&fc_stats->addlist_lock);
+		llist_add(&counter->addlist, &fc_stats->addlist);
 
 		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
 	}
@@ -268,8 +269,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
 
 	fc_stats->counters = RB_ROOT;
-	INIT_LIST_HEAD(&fc_stats->addlist);
-	spin_lock_init(&fc_stats->addlist_lock);
+	init_llist_head(&fc_stats->addlist);
 
 	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
 	if (!fc_stats->wq)
@@ -284,6 +284,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct llist_node *tmplist;
 	struct mlx5_fc *counter;
 	struct mlx5_fc *tmp;
 	struct rb_node *node;
@@ -292,13 +293,9 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	destroy_workqueue(dev->priv.fc_stats.wq);
 	dev->priv.fc_stats.wq = NULL;
 
-	list_for_each_entry_safe(counter, tmp, &fc_stats->addlist, list) {
-		list_del(&counter->list);
-
-		mlx5_cmd_fc_free(dev, counter->id);
-
-		kfree(counter);
-	}
+	tmplist = llist_del_all(&fc_stats->addlist);
+	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
+		mlx5_free_fc(dev, counter);
 
 	node = rb_first(&fc_stats->counters);
 	while (node) {
@@ -308,9 +305,7 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 
 		rb_erase(&counter->node, &fc_stats->counters);
 
-		mlx5_cmd_fc_free(dev, counter->id);
-
-		kfree(counter);
+		mlx5_free_fc(dev, counter);
 	}
 }
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7a452716de4b..c00549293982 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -584,9 +584,7 @@ struct mlx5_irq_info {
 
 struct mlx5_fc_stats {
 	struct rb_root counters;
-	struct list_head addlist;
-	/* protect addlist add/splice operations */
-	spinlock_t addlist_lock;
+	struct llist_head addlist;
 
 	struct workqueue_struct *wq;
 	struct delayed_work work;
-- 
cgit v1.2.3


From 6e5e22839136fdb466af0aa46ff2404713dff974 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 23 Jul 2018 11:32:05 +0300
Subject: net/mlx5: Add new list to store deleted flow counters

In order to prevent flow counters stats work function from traversing whole
flow counters tree while searching for deleted flow counters, new list to
store deleted flow counters is added to struct mlx5_fc_stats. Lockless
NULL-terminated single linked list data type is used due to following
reasons:
 - This use case only needs to add single element to list and
 remove/iterate whole list. Lockless list doesn't require any additional
 synchronization for these operations.
 - First cache line of flow counter data structure only has space to store
 single additional pointer, which precludes usage of double linked list.

Remove flow counter 'deleted' flag that is no longer needed.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 34 ++++++++--------------
 include/linux/mlx5/driver.h                        |  1 +
 3 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index f68590291e0c..617d6239c5f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -141,6 +141,7 @@ struct mlx5_fc_cache {
 struct mlx5_fc {
 	struct rb_node node;
 	struct llist_node addlist;
+	struct llist_node dellist;
 
 	/* last{packets,bytes} members are used when calculating the delta since
 	 * last reading
@@ -149,7 +150,6 @@ struct mlx5_fc {
 	u64 lastbytes;
 
 	u32 id;
-	bool deleted;
 	bool aging;
 
 	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index d996d6cf9e19..f1266f215a31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -58,7 +58,7 @@
  *   - spawn thread to do the actual destroy
  *
  * - destroy (user context)
- *   - mark a counter as deleted
+ *   - add a counter to lockless dellist
  *   - spawn thread to do the actual del
  *
  * - dump (user context)
@@ -171,9 +171,8 @@ static void mlx5_fc_stats_work(struct work_struct *work)
 						 priv.fc_stats.work.work);
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
 	struct llist_node *tmplist = llist_del_all(&fc_stats->addlist);
+	struct mlx5_fc *counter = NULL, *last = NULL, *tmp;
 	unsigned long now = jiffies;
-	struct mlx5_fc *counter = NULL;
-	struct mlx5_fc *last = NULL;
 	struct rb_node *node;
 
 	if (tmplist || !RB_EMPTY_ROOT(&fc_stats->counters))
@@ -183,26 +182,17 @@ static void mlx5_fc_stats_work(struct work_struct *work)
 	llist_for_each_entry(counter, tmplist, addlist)
 		mlx5_fc_stats_insert(&fc_stats->counters, counter);
 
-	node = rb_first(&fc_stats->counters);
-	while (node) {
-		counter = rb_entry(node, struct mlx5_fc, node);
-
-		node = rb_next(node);
-
-		if (counter->deleted) {
-			rb_erase(&counter->node, &fc_stats->counters);
-
-			mlx5_cmd_fc_free(dev, counter->id);
-
-			kfree(counter);
-			continue;
-		}
+	tmplist = llist_del_all(&fc_stats->dellist);
+	llist_for_each_entry_safe(counter, tmp, tmplist, dellist) {
+		rb_erase(&counter->node, &fc_stats->counters);
 
-		last = counter;
+		mlx5_free_fc(dev, counter);
 	}
 
-	if (time_before(now, fc_stats->next_query) || !last)
+	node = rb_last(&fc_stats->counters);
+	if (time_before(now, fc_stats->next_query) || !node)
 		return;
+	last = rb_entry(node, struct mlx5_fc, node);
 
 	node = rb_first(&fc_stats->counters);
 	while (node) {
@@ -254,13 +244,12 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 		return;
 
 	if (counter->aging) {
-		counter->deleted = true;
+		llist_add(&counter->dellist, &fc_stats->dellist);
 		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
 		return;
 	}
 
-	mlx5_cmd_fc_free(dev, counter->id);
-	kfree(counter);
+	mlx5_free_fc(dev, counter);
 }
 EXPORT_SYMBOL(mlx5_fc_destroy);
 
@@ -270,6 +259,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 
 	fc_stats->counters = RB_ROOT;
 	init_llist_head(&fc_stats->addlist);
+	init_llist_head(&fc_stats->dellist);
 
 	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
 	if (!fc_stats->wq)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index c00549293982..4b53ac64004b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -585,6 +585,7 @@ struct mlx5_irq_info {
 struct mlx5_fc_stats {
 	struct rb_root counters;
 	struct llist_head addlist;
+	struct llist_head dellist;
 
 	struct workqueue_struct *wq;
 	struct delayed_work work;
-- 
cgit v1.2.3


From 9aff93d7d0d4b3f3076d7bd12a4ad06ef1cf9804 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Tue, 24 Jul 2018 09:52:11 +0300
Subject: net/mlx5: Store flow counters in a list

In order to improve performance of flow counter stats query loop that
traverses all configured flow counters, replace rb_tree with double-linked
list. This change improves performance of traversing flow counters by
removing the tree traversal. (profiling data showed that call to rb_next
was most top CPU consumer)

However, lookup of flow flow counter in list becomes linear, instead of
logarithmic. This problem is fixed by next patch in series, which adds idr
for fast lookup. Idr is to be used because it is not an intrusive data
structure and doesn't require adding any new members to struct mlx5_fc,
which allows its control data part to stay <= 1 cache line in size.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 88 ++++++++++------------
 include/linux/mlx5/driver.h                        |  2 +-
 3 files changed, 42 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 617d6239c5f3..a06f83c0c2b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -139,7 +139,7 @@ struct mlx5_fc_cache {
 };
 
 struct mlx5_fc {
-	struct rb_node node;
+	struct list_head list;
 	struct llist_node addlist;
 	struct llist_node dellist;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index f1266f215a31..90ebfee37508 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -73,36 +73,38 @@
  *   elapsed, the thread will actually query the hardware.
  */
 
-static void mlx5_fc_stats_insert(struct rb_root *root, struct mlx5_fc *counter)
+static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
+						      u32 id)
 {
-	struct rb_node **new = &root->rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*new) {
-		struct mlx5_fc *this = rb_entry(*new, struct mlx5_fc, node);
-		int result = counter->id - this->id;
-
-		parent = *new;
-		if (result < 0)
-			new = &((*new)->rb_left);
-		else
-			new = &((*new)->rb_right);
-	}
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+
+	list_for_each_entry(counter, &fc_stats->counters, list)
+		if (counter->id > id)
+			return &counter->list;
+
+	return &fc_stats->counters;
+}
+
+static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev,
+				 struct mlx5_fc *counter)
+{
+	struct list_head *next = mlx5_fc_counters_lookup_next(dev, counter->id);
 
-	/* Add new node and rebalance tree. */
-	rb_link_node(&counter->node, parent, new);
-	rb_insert_color(&counter->node, root);
+	list_add_tail(&counter->list, next);
 }
 
-/* The function returns the last node that was queried so the caller
+/* The function returns the last counter that was queried so the caller
  * function can continue calling it till all counters are queried.
  */
-static struct rb_node *mlx5_fc_stats_query(struct mlx5_core_dev *dev,
+static struct mlx5_fc *mlx5_fc_stats_query(struct mlx5_core_dev *dev,
 					   struct mlx5_fc *first,
 					   u32 last_id)
 {
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter = NULL;
 	struct mlx5_cmd_fc_bulk *b;
-	struct rb_node *node = NULL;
+	bool more = false;
 	u32 afirst_id;
 	int num;
 	int err;
@@ -132,14 +134,16 @@ static struct rb_node *mlx5_fc_stats_query(struct mlx5_core_dev *dev,
 		goto out;
 	}
 
-	for (node = &first->node; node; node = rb_next(node)) {
-		struct mlx5_fc *counter = rb_entry(node, struct mlx5_fc, node);
+	counter = first;
+	list_for_each_entry_from(counter, &fc_stats->counters, list) {
 		struct mlx5_fc_cache *c = &counter->cache;
 		u64 packets;
 		u64 bytes;
 
-		if (counter->id > last_id)
+		if (counter->id > last_id) {
+			more = true;
 			break;
+		}
 
 		mlx5_cmd_fc_bulk_get(dev, b,
 				     counter->id, &packets, &bytes);
@@ -155,7 +159,7 @@ static struct rb_node *mlx5_fc_stats_query(struct mlx5_core_dev *dev,
 out:
 	mlx5_cmd_fc_bulk_free(b);
 
-	return node;
+	return more ? counter : NULL;
 }
 
 static void mlx5_free_fc(struct mlx5_core_dev *dev,
@@ -173,33 +177,30 @@ static void mlx5_fc_stats_work(struct work_struct *work)
 	struct llist_node *tmplist = llist_del_all(&fc_stats->addlist);
 	struct mlx5_fc *counter = NULL, *last = NULL, *tmp;
 	unsigned long now = jiffies;
-	struct rb_node *node;
 
-	if (tmplist || !RB_EMPTY_ROOT(&fc_stats->counters))
+	if (tmplist || !list_empty(&fc_stats->counters))
 		queue_delayed_work(fc_stats->wq, &fc_stats->work,
 				   fc_stats->sampling_interval);
 
 	llist_for_each_entry(counter, tmplist, addlist)
-		mlx5_fc_stats_insert(&fc_stats->counters, counter);
+		mlx5_fc_stats_insert(dev, counter);
 
 	tmplist = llist_del_all(&fc_stats->dellist);
 	llist_for_each_entry_safe(counter, tmp, tmplist, dellist) {
-		rb_erase(&counter->node, &fc_stats->counters);
+		list_del(&counter->list);
 
 		mlx5_free_fc(dev, counter);
 	}
 
-	node = rb_last(&fc_stats->counters);
-	if (time_before(now, fc_stats->next_query) || !node)
+	if (time_before(now, fc_stats->next_query) ||
+	    list_empty(&fc_stats->counters))
 		return;
-	last = rb_entry(node, struct mlx5_fc, node);
-
-	node = rb_first(&fc_stats->counters);
-	while (node) {
-		counter = rb_entry(node, struct mlx5_fc, node);
+	last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list);
 
-		node = mlx5_fc_stats_query(dev, counter, last->id);
-	}
+	counter = list_first_entry(&fc_stats->counters, struct mlx5_fc,
+				   list);
+	while (counter)
+		counter = mlx5_fc_stats_query(dev, counter, last->id);
 
 	fc_stats->next_query = now + fc_stats->sampling_interval;
 }
@@ -257,7 +258,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
 
-	fc_stats->counters = RB_ROOT;
+	INIT_LIST_HEAD(&fc_stats->counters);
 	init_llist_head(&fc_stats->addlist);
 	init_llist_head(&fc_stats->dellist);
 
@@ -277,7 +278,6 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	struct llist_node *tmplist;
 	struct mlx5_fc *counter;
 	struct mlx5_fc *tmp;
-	struct rb_node *node;
 
 	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
 	destroy_workqueue(dev->priv.fc_stats.wq);
@@ -287,16 +287,8 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
 		mlx5_free_fc(dev, counter);
 
-	node = rb_first(&fc_stats->counters);
-	while (node) {
-		counter = rb_entry(node, struct mlx5_fc, node);
-
-		node = rb_next(node);
-
-		rb_erase(&counter->node, &fc_stats->counters);
-
+	list_for_each_entry_safe(counter, tmp, &fc_stats->counters, list)
 		mlx5_free_fc(dev, counter);
-	}
 }
 
 int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4b53ac64004b..61bed33e6675 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -583,7 +583,7 @@ struct mlx5_irq_info {
 };
 
 struct mlx5_fc_stats {
-	struct rb_root counters;
+	struct list_head counters;
 	struct llist_head addlist;
 	struct llist_head dellist;
 
-- 
cgit v1.2.3


From 12d6066c3b29c5606c4a2466f964fbd9ede803c5 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Tue, 24 Jul 2018 16:37:40 +0300
Subject: net/mlx5: Add flow counters idr

Previous patch in series changed flow counter storage structure from
rb_tree to linked list in order to improve flow counter traversal
performance. The drawback of such solution is that flow counter lookup by
id becomes linear in complexity.

Store pointers to flow counters in idr in order to improve lookup
performance to logarithmic again. Idr is non-intrusive data structure and
doesn't require extending flow counter struct with new elements. This means
that idr can be used for lookup, while linked list from previous patch is
used for traversal, and struct mlx5_fc size is <= 2 cache lines.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Amir Vadai <amir@vadai.me>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/fs_counters.c  | 37 +++++++++++++++++++---
 include/linux/mlx5/driver.h                        |  2 ++
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 90ebfee37508..09206c4acd9a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -77,13 +77,18 @@ static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
 						      u32 id)
 {
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	unsigned long next_id = (unsigned long)id + 1;
 	struct mlx5_fc *counter;
 
-	list_for_each_entry(counter, &fc_stats->counters, list)
-		if (counter->id > id)
-			return &counter->list;
+	rcu_read_lock();
+	/* skip counters that are in idr, but not yet in counters list */
+	while ((counter = idr_get_next_ul(&fc_stats->counters_idr,
+					  &next_id)) != NULL &&
+	       list_empty(&counter->list))
+		next_id++;
+	rcu_read_unlock();
 
-	return &fc_stats->counters;
+	return counter ? &counter->list : &fc_stats->counters;
 }
 
 static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev,
@@ -214,15 +219,29 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 	if (!counter)
 		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&counter->list);
 
 	err = mlx5_cmd_fc_alloc(dev, &counter->id);
 	if (err)
 		goto err_out;
 
 	if (aging) {
+		u32 id = counter->id;
+
 		counter->cache.lastuse = jiffies;
 		counter->aging = true;
 
+		idr_preload(GFP_KERNEL);
+		spin_lock(&fc_stats->counters_idr_lock);
+
+		err = idr_alloc_u32(&fc_stats->counters_idr, counter, &id, id,
+				    GFP_NOWAIT);
+
+		spin_unlock(&fc_stats->counters_idr_lock);
+		idr_preload_end();
+		if (err)
+			goto err_out_alloc;
+
 		llist_add(&counter->addlist, &fc_stats->addlist);
 
 		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
@@ -230,6 +249,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
 
 	return counter;
 
+err_out_alloc:
+	mlx5_cmd_fc_free(dev, counter->id);
 err_out:
 	kfree(counter);
 
@@ -245,6 +266,10 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 		return;
 
 	if (counter->aging) {
+		spin_lock(&fc_stats->counters_idr_lock);
+		WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id));
+		spin_unlock(&fc_stats->counters_idr_lock);
+
 		llist_add(&counter->dellist, &fc_stats->dellist);
 		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
 		return;
@@ -258,6 +283,8 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
 
+	spin_lock_init(&fc_stats->counters_idr_lock);
+	idr_init(&fc_stats->counters_idr);
 	INIT_LIST_HEAD(&fc_stats->counters);
 	init_llist_head(&fc_stats->addlist);
 	init_llist_head(&fc_stats->dellist);
@@ -283,6 +310,8 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
 	destroy_workqueue(dev->priv.fc_stats.wq);
 	dev->priv.fc_stats.wq = NULL;
 
+	idr_destroy(&fc_stats->counters_idr);
+
 	tmplist = llist_del_all(&fc_stats->addlist);
 	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
 		mlx5_free_fc(dev, counter);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 61bed33e6675..2a0c845f6bdb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -583,6 +583,8 @@ struct mlx5_irq_info {
 };
 
 struct mlx5_fc_stats {
+	spinlock_t counters_idr_lock; /* protects counters_idr */
+	struct idr counters_idr;
 	struct list_head counters;
 	struct llist_head addlist;
 	struct llist_head dellist;
-- 
cgit v1.2.3


From 64109f1dc41f25f4a9c6b114e04b6266bf4128ad Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayag@mellanox.com>
Date: Tue, 5 Jun 2018 09:22:18 +0300
Subject: net/mlx5e: Replace PTP clock lock from RW lock to seq lock

Changed "priv.clock.lock" lock from 'rw_lock' to 'seq_lock'
in order to improve packet rate performance.

Tested on Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz.
Sent 64b packets between two peers connected by ConnectX-5,
and measured packet rate for the receiver in three modes:
	no time-stamping (base rate)
	time-stamping using rw_lock (old lock) for critical region
	time-stamping using seq_lock (new lock) for critical region
Only the receiver time stamped its packets.

The measured packet rate improvements are:

	Single flow (multiple TX rings to single RX ring):
		without timestamping:	  4.26 (M packets)/sec
		with rw-lock (old lock):  4.1  (M packets)/sec
		with seq-lock (new lock): 4.16 (M packets)/sec
		1.46% improvement

	Multiple flows (multiple TX rings to six RX rings):
		without timestamping: 	  22   (M packets)/sec
		with rw-lock (old lock):  11.7 (M packets)/sec
		with seq-lock (new lock): 21.3 (M packets)/sec
		82.05% improvement

The packet rate improvement is due to the lack of atomic operations
for the 'readers' by the seq-lock.
Since there are much more 'readers' than 'writers' contention
on this lock, almost all atomic operations are saved.
this results in a dramatic decrease in overall
cache misses.

Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/lib/clock.c    | 34 +++++++++++-----------
 .../net/ethernet/mellanox/mlx5/core/lib/clock.h    |  8 +++--
 include/linux/mlx5/driver.h                        |  2 +-
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
index 3f767cde4c1d..0d90b1b4a3d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -111,10 +111,10 @@ static void mlx5_pps_out(struct work_struct *work)
 	for (i = 0; i < clock->ptp_info.n_pins; i++) {
 		u64 tstart;
 
-		write_lock_irqsave(&clock->lock, flags);
+		write_seqlock_irqsave(&clock->lock, flags);
 		tstart = clock->pps_info.start[i];
 		clock->pps_info.start[i] = 0;
-		write_unlock_irqrestore(&clock->lock, flags);
+		write_sequnlock_irqrestore(&clock->lock, flags);
 		if (!tstart)
 			continue;
 
@@ -132,10 +132,10 @@ static void mlx5_timestamp_overflow(struct work_struct *work)
 						overflow_work);
 	unsigned long flags;
 
-	write_lock_irqsave(&clock->lock, flags);
+	write_seqlock_irqsave(&clock->lock, flags);
 	timecounter_read(&clock->tc);
 	mlx5_update_clock_info_page(clock->mdev);
-	write_unlock_irqrestore(&clock->lock, flags);
+	write_sequnlock_irqrestore(&clock->lock, flags);
 	schedule_delayed_work(&clock->overflow_work, clock->overflow_period);
 }
 
@@ -147,10 +147,10 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp,
 	u64 ns = timespec64_to_ns(ts);
 	unsigned long flags;
 
-	write_lock_irqsave(&clock->lock, flags);
+	write_seqlock_irqsave(&clock->lock, flags);
 	timecounter_init(&clock->tc, &clock->cycles, ns);
 	mlx5_update_clock_info_page(clock->mdev);
-	write_unlock_irqrestore(&clock->lock, flags);
+	write_sequnlock_irqrestore(&clock->lock, flags);
 
 	return 0;
 }
@@ -162,9 +162,9 @@ static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 	u64 ns;
 	unsigned long flags;
 
-	write_lock_irqsave(&clock->lock, flags);
+	write_seqlock_irqsave(&clock->lock, flags);
 	ns = timecounter_read(&clock->tc);
-	write_unlock_irqrestore(&clock->lock, flags);
+	write_sequnlock_irqrestore(&clock->lock, flags);
 
 	*ts = ns_to_timespec64(ns);
 
@@ -177,10 +177,10 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 						ptp_info);
 	unsigned long flags;
 
-	write_lock_irqsave(&clock->lock, flags);
+	write_seqlock_irqsave(&clock->lock, flags);
 	timecounter_adjtime(&clock->tc, delta);
 	mlx5_update_clock_info_page(clock->mdev);
-	write_unlock_irqrestore(&clock->lock, flags);
+	write_sequnlock_irqrestore(&clock->lock, flags);
 
 	return 0;
 }
@@ -203,12 +203,12 @@ static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
 	adj *= delta;
 	diff = div_u64(adj, 1000000000ULL);
 
-	write_lock_irqsave(&clock->lock, flags);
+	write_seqlock_irqsave(&clock->lock, flags);
 	timecounter_read(&clock->tc);
 	clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff :
 				       clock->nominal_c_mult + diff;
 	mlx5_update_clock_info_page(clock->mdev);
-	write_unlock_irqrestore(&clock->lock, flags);
+	write_sequnlock_irqrestore(&clock->lock, flags);
 
 	return 0;
 }
@@ -307,12 +307,12 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp,
 		ts.tv_nsec = rq->perout.start.nsec;
 		ns = timespec64_to_ns(&ts);
 		cycles_now = mlx5_read_internal_timer(mdev);
-		write_lock_irqsave(&clock->lock, flags);
+		write_seqlock_irqsave(&clock->lock, flags);
 		nsec_now = timecounter_cyc2time(&clock->tc, cycles_now);
 		nsec_delta = ns - nsec_now;
 		cycles_delta = div64_u64(nsec_delta << clock->cycles.shift,
 					 clock->cycles.mult);
-		write_unlock_irqrestore(&clock->lock, flags);
+		write_sequnlock_irqrestore(&clock->lock, flags);
 		time_stamp = cycles_now + cycles_delta;
 		field_select = MLX5_MTPPS_FS_PIN_MODE |
 			       MLX5_MTPPS_FS_PATTERN |
@@ -471,14 +471,14 @@ void mlx5_pps_event(struct mlx5_core_dev *mdev,
 		ts.tv_sec += 1;
 		ts.tv_nsec = 0;
 		ns = timespec64_to_ns(&ts);
-		write_lock_irqsave(&clock->lock, flags);
+		write_seqlock_irqsave(&clock->lock, flags);
 		nsec_now = timecounter_cyc2time(&clock->tc, cycles_now);
 		nsec_delta = ns - nsec_now;
 		cycles_delta = div64_u64(nsec_delta << clock->cycles.shift,
 					 clock->cycles.mult);
 		clock->pps_info.start[pin] = cycles_now + cycles_delta;
 		schedule_work(&clock->pps_info.out_work);
-		write_unlock_irqrestore(&clock->lock, flags);
+		write_sequnlock_irqrestore(&clock->lock, flags);
 		break;
 	default:
 		mlx5_core_err(mdev, " Unhandled event\n");
@@ -498,7 +498,7 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev)
 		mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n");
 		return;
 	}
-	rwlock_init(&clock->lock);
+	seqlock_init(&clock->lock);
 	clock->cycles.read = read_internal_timer;
 	clock->cycles.shift = MLX5_CYCLES_SHIFT;
 	clock->cycles.mult = clocksource_khz2mult(dev_freq,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
index 02e2e4575e4f..263cb6e2aeee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
@@ -46,11 +46,13 @@ static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev)
 static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock,
 						u64 timestamp)
 {
+	unsigned int seq;
 	u64 nsec;
 
-	read_lock(&clock->lock);
-	nsec = timecounter_cyc2time(&clock->tc, timestamp);
-	read_unlock(&clock->lock);
+	do {
+		seq = read_seqbegin(&clock->lock);
+		nsec = timecounter_cyc2time(&clock->tc, timestamp);
+	} while (read_seqretry(&clock->lock, seq));
 
 	return ns_to_ktime(nsec);
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2a0c845f6bdb..b7fce2c9443d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -805,7 +805,7 @@ struct mlx5_pps {
 };
 
 struct mlx5_clock {
-	rwlock_t                   lock;
+	seqlock_t                  lock;
 	struct cyclecounter        cycles;
 	struct timecounter         tc;
 	struct hwtstamp_config     hwtstamp_config;
-- 
cgit v1.2.3


From fa788d986a3aac5069378ed04697bd06f83d3488 Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Mon, 3 Sep 2018 16:23:36 +0200
Subject: packet: add sockopt to ignore outgoing packets

Currently, the only way to ignore outgoing packets on a packet socket is
via the BPF filter.  With MSG_ZEROCOPY, packets that are looped into
AF_PACKET are copied in dev_queue_xmit_nit(), and this copy happens even
if the filter run from packet_rcv() would reject them.  So the presence
of a packet socket on the interface takes away the benefits of
MSG_ZEROCOPY, even if the packet socket is not interested in outgoing
packets.  (Even when MSG_ZEROCOPY is not used, the skb is unnecessarily
cloned, but the cost for that is much lower.)

Add a socket option to allow AF_PACKET sockets to ignore outgoing
packets to solve this.  Note that the *BSDs already have something
similar: BIOCSSEESENT/BIOCSDIRECTION and BIOCSDIRFILT.

The first intended user is lldpd.

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h      |  1 +
 include/uapi/linux/if_packet.h |  1 +
 net/core/dev.c                 |  3 +++
 net/packet/af_packet.c         | 17 +++++++++++++++++
 4 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4271f6b4e419..e2b3bd750c98 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2343,6 +2343,7 @@ static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
 
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
+	bool			ignore_outgoing;
 	struct net_device	*dev;	/* NULL is wildcarded here	     */
 	int			(*func) (struct sk_buff *,
 					 struct net_device *,
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 67b61d91d89b..467b654bd4c7 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -57,6 +57,7 @@ struct sockaddr_ll {
 #define PACKET_QDISC_BYPASS		20
 #define PACKET_ROLLOVER_STATS		21
 #define PACKET_FANOUT_DATA		22
+#define PACKET_IGNORE_OUTGOING		23
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/core/dev.c b/net/core/dev.c
index 82114e1111e6..ca78dc5a79a3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1969,6 +1969,9 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	rcu_read_lock();
 again:
 	list_for_each_entry_rcu(ptype, ptype_list, list) {
+		if (ptype->ignore_outgoing)
+			continue;
+
 		/* Never send packets back to the socket
 		 * they originated from - MvS (miquels@drinkel.ow.org)
 		 */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 75c92a87e7b2..f85f67b5c1f4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3805,6 +3805,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		return fanout_set_data(po, optval, optlen);
 	}
+	case PACKET_IGNORE_OUTGOING:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+		if (val < 0 || val > 1)
+			return -EINVAL;
+
+		po->prot_hook.ignore_outgoing = !!val;
+		return 0;
+	}
 	case PACKET_TX_HAS_OFF:
 	{
 		unsigned int val;
@@ -3928,6 +3942,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 			((u32)po->fanout->flags << 24)) :
 		       0);
 		break;
+	case PACKET_IGNORE_OUTGOING:
+		val = po->prot_hook.ignore_outgoing;
+		break;
 	case PACKET_ROLLOVER_STATS:
 		if (!po->rollover)
 			return -EINVAL;
-- 
cgit v1.2.3


From c383edc42403b0bca31cbaabafd44dd58afb202f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 4 Sep 2018 21:53:47 +0200
Subject: rtnetlink: add rtnl_get_net_ns_capable()

get_target_net() will be used in follow-up patches in ipv{4,6} codepaths to
retrieve network namespaces based on network namespace identifiers. So
remove the static declaration and export in the rtnetlink header. Also,
rename it to rtnl_get_net_ns_capable() to make it obvious what this
function is doing.
Export rtnl_get_net_ns_capable() so it can be used when ipv6 is built as
a module.

Signed-off-by: Christian Brauner <christian@brauner.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h |  1 +
 net/core/rtnetlink.c    | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 0bbaa5488423..cf26e5aacac4 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -165,6 +165,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
 
 int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
 			struct netlink_ext_ack *exterr);
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid);
 
 #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 60c928894a78..c49097237c30 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1845,7 +1845,15 @@ static bool link_dump_filtered(struct net_device *dev,
 	return false;
 }
 
-static struct net *get_target_net(struct sock *sk, int netnsid)
+/**
+ * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
+ * @sk: netlink socket
+ * @netnsid: network namespace identifier
+ *
+ * Returns the network namespace identified by netnsid on success or an error
+ * pointer on failure.
+ */
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
 {
 	struct net *net;
 
@@ -1862,6 +1870,7 @@ static struct net *get_target_net(struct sock *sk, int netnsid)
 	}
 	return net;
 }
+EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);
 
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -1897,7 +1906,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			ifla_policy, NULL) >= 0) {
 		if (tb[IFLA_IF_NETNSID]) {
 			netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-			tgt_net = get_target_net(skb->sk, netnsid);
+			tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
 			if (IS_ERR(tgt_net)) {
 				tgt_net = net;
 				netnsid = -1;
@@ -2765,7 +2774,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+		tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
 		if (IS_ERR(tgt_net))
 			return PTR_ERR(tgt_net);
 	}
@@ -3175,7 +3184,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+		tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
 		if (IS_ERR(tgt_net))
 			return PTR_ERR(tgt_net);
 	}
-- 
cgit v1.2.3


From 9f3c057c146fce335c160e95ca893d5bc34e7d00 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 4 Sep 2018 21:53:48 +0200
Subject: if_addr: add IFA_TARGET_NETNSID

This adds a new IFA_TARGET_NETNSID property to be used by address
families such as PF_INET and PF_INET6.
The IFA_TARGET_NETNSID property can be used to send a network namespace
identifier as part of a request. If a IFA_TARGET_NETNSID property is
identified it will be used to retrieve the target network namespace in
which the request is to be made.

Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Jiri Benc <jbenc@redhat.com>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index ebaf5701c9db..dfcf3ce0097f 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -34,6 +34,7 @@ enum {
 	IFA_MULTICAST,
 	IFA_FLAGS,
 	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
+	IFA_TARGET_NETNSID,
 	__IFA_MAX,
 };
 
-- 
cgit v1.2.3


From 19d8f1ad12fd746e60707a58d954980013c7a35a Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 4 Sep 2018 21:53:52 +0200
Subject: if_link: add IFLA_TARGET_NETNSID alias

This adds IFLA_TARGET_NETNSID as an alias for IFLA_IF_NETNSID for
RTM_*LINK requests.
The new name is clearer and also aligns with the newly introduced
IFA_TARGET_NETNSID propert for RTM_*ADDR requests.

Signed-off-by: Christian Brauner <christian@brauner.io>
Suggested-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Cc: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h       | 1 +
 tools/include/uapi/linux/if_link.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 43391e2d1153..29d49b989acd 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -161,6 +161,7 @@ enum {
 	IFLA_EVENT,
 	IFLA_NEW_NETNSID,
 	IFLA_IF_NETNSID,
+	IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
 	IFLA_NEW_IFINDEX,
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index cf01b6824244..1c73d63068b1 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -161,6 +161,7 @@ enum {
 	IFLA_EVENT,
 	IFLA_NEW_NETNSID,
 	IFLA_IF_NETNSID,
+	IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
 	IFLA_NEW_IFINDEX,
-- 
cgit v1.2.3


From a3f723079df85eafc10c628dabdfcf374b8e1523 Mon Sep 17 00:00:00 2001
From: Denis Bolotin <denis.bolotin@cavium.com>
Date: Wed, 5 Sep 2018 18:35:55 +0300
Subject: qed*: Utilize FW 8.37.7.0

This patch adds a new qed firmware with fixes and support for new features.

Fixes:
- Fix a rare case of device crash with iWARP, iSCSI or FCoE offload.
- Fix GRE tunneled traffic when iWARP offload is enabled.
- Fix RoCE failure in ib_send_bw when using inline data.
- Fix latency optimization flow for inline WQEs.
- BigBear 100G fix

RDMA:
- Reduce task context size.
- Application page sizes above 2GB support.
- Performance improvements.

ETH:
- Tenant DCB support.
- Replace RSS indirection table update interface.

Misc:
- Debug Tools changes.

Signed-off-by: Denis Bolotin <denis.bolotin@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h       |   1 +
 drivers/net/ethernet/qlogic/qed/qed_debug.c | 248 +++++++++++++----------
 drivers/net/ethernet/qlogic/qed/qed_dev.c   |  11 ++
 drivers/net/ethernet/qlogic/qed/qed_hsi.h   | 297 +++++++++++++++++++---------
 include/linux/qed/common_hsi.h              |  10 +-
 include/linux/qed/iscsi_common.h            |   2 +-
 6 files changed, 367 insertions(+), 202 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index a60e1c8d470a..5f0962d353ce 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -623,6 +623,7 @@ struct qed_hwfn {
 	void				*unzip_buf;
 
 	struct dbg_tools_data		dbg_info;
+	void				*dbg_user_info;
 
 	/* PWM region specific data */
 	u16				wid_count;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index 1aa9fc1c5890..78a638ec7c0a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -3454,7 +3454,8 @@ static u32 qed_grc_dump_iors(struct qed_hwfn *p_hwfn,
 			addr = BYTES_TO_DWORDS(storm->sem_fast_mem_addr +
 					       SEM_FAST_REG_STORM_REG_FILE) +
 			       IOR_SET_OFFSET(set_id);
-			buf[strlen(buf) - 1] = '0' + set_id;
+			if (strlen(buf) > 0)
+				buf[strlen(buf) - 1] = '0' + set_id;
 			offset += qed_grc_dump_mem(p_hwfn,
 						   p_ptt,
 						   dump_buf + offset,
@@ -5563,35 +5564,6 @@ struct block_info {
 	enum block_id id;
 };
 
-struct mcp_trace_format {
-	u32 data;
-#define MCP_TRACE_FORMAT_MODULE_MASK	0x0000ffff
-#define MCP_TRACE_FORMAT_MODULE_SHIFT	0
-#define MCP_TRACE_FORMAT_LEVEL_MASK	0x00030000
-#define MCP_TRACE_FORMAT_LEVEL_SHIFT	16
-#define MCP_TRACE_FORMAT_P1_SIZE_MASK	0x000c0000
-#define MCP_TRACE_FORMAT_P1_SIZE_SHIFT	18
-#define MCP_TRACE_FORMAT_P2_SIZE_MASK	0x00300000
-#define MCP_TRACE_FORMAT_P2_SIZE_SHIFT	20
-#define MCP_TRACE_FORMAT_P3_SIZE_MASK	0x00c00000
-#define MCP_TRACE_FORMAT_P3_SIZE_SHIFT	22
-#define MCP_TRACE_FORMAT_LEN_MASK	0xff000000
-#define MCP_TRACE_FORMAT_LEN_SHIFT	24
-
-	char *format_str;
-};
-
-/* Meta data structure, generated by a perl script during MFW build. therefore,
- * the structs mcp_trace_meta and mcp_trace_format are duplicated in the perl
- * script.
- */
-struct mcp_trace_meta {
-	u32 modules_num;
-	char **modules;
-	u32 formats_num;
-	struct mcp_trace_format *formats;
-};
-
 /* REG fifo element */
 struct reg_fifo_element {
 	u64 data;
@@ -5714,6 +5686,20 @@ struct igu_fifo_addr_data {
 	enum igu_fifo_addr_types type;
 };
 
+struct mcp_trace_meta {
+	u32 modules_num;
+	char **modules;
+	u32 formats_num;
+	struct mcp_trace_format *formats;
+	bool is_allocated;
+};
+
+/* Debug Tools user data */
+struct dbg_tools_user_data {
+	struct mcp_trace_meta mcp_trace_meta;
+	const u32 *mcp_trace_user_meta_buf;
+};
+
 /******************************** Constants **********************************/
 
 #define MAX_MSG_LEN				1024
@@ -6137,15 +6123,6 @@ static const struct igu_fifo_addr_data s_igu_fifo_addr_data[] = {
 
 /******************************** Variables **********************************/
 
-/* MCP Trace meta data array - used in case the dump doesn't contain the
- * meta data (e.g. due to no NVRAM access).
- */
-static struct user_dbg_array s_mcp_trace_meta_arr = { NULL, 0 };
-
-/* Parsed MCP Trace meta data info, based on MCP trace meta array */
-static struct mcp_trace_meta s_mcp_trace_meta;
-static bool s_mcp_trace_meta_valid;
-
 /* Temporary buffer, used for print size calculations */
 static char s_temp_buf[MAX_MSG_LEN];
 
@@ -6311,6 +6288,12 @@ static u32 qed_print_section_params(u32 *dump_buf,
 	return dump_offset;
 }
 
+static struct dbg_tools_user_data *
+qed_dbg_get_user_data(struct qed_hwfn *p_hwfn)
+{
+	return (struct dbg_tools_user_data *)p_hwfn->dbg_user_info;
+}
+
 /* Parses the idle check rules and returns the number of characters printed.
  * In case of parsing error, returns 0.
  */
@@ -6570,43 +6553,26 @@ static enum dbg_status qed_parse_idle_chk_dump(u32 *dump_buf,
 	return DBG_STATUS_OK;
 }
 
-/* Frees the specified MCP Trace meta data */
-static void qed_mcp_trace_free_meta(struct qed_hwfn *p_hwfn,
-				    struct mcp_trace_meta *meta)
-{
-	u32 i;
-
-	s_mcp_trace_meta_valid = false;
-
-	/* Release modules */
-	if (meta->modules) {
-		for (i = 0; i < meta->modules_num; i++)
-			kfree(meta->modules[i]);
-		kfree(meta->modules);
-	}
-
-	/* Release formats */
-	if (meta->formats) {
-		for (i = 0; i < meta->formats_num; i++)
-			kfree(meta->formats[i].format_str);
-		kfree(meta->formats);
-	}
-}
-
 /* Allocates and fills MCP Trace meta data based on the specified meta data
  * dump buffer.
  * Returns debug status code.
  */
-static enum dbg_status qed_mcp_trace_alloc_meta(struct qed_hwfn *p_hwfn,
-						const u32 *meta_buf,
-						struct mcp_trace_meta *meta)
+static enum dbg_status
+qed_mcp_trace_alloc_meta_data(struct qed_hwfn *p_hwfn,
+			      const u32 *meta_buf)
 {
-	u8 *meta_buf_bytes = (u8 *)meta_buf;
+	struct dbg_tools_user_data *dev_user_data;
 	u32 offset = 0, signature, i;
+	struct mcp_trace_meta *meta;
+	u8 *meta_buf_bytes;
+
+	dev_user_data = qed_dbg_get_user_data(p_hwfn);
+	meta = &dev_user_data->mcp_trace_meta;
+	meta_buf_bytes = (u8 *)meta_buf;
 
 	/* Free the previous meta before loading a new one. */
-	if (s_mcp_trace_meta_valid)
-		qed_mcp_trace_free_meta(p_hwfn, meta);
+	if (meta->is_allocated)
+		qed_mcp_trace_free_meta_data(p_hwfn);
 
 	memset(meta, 0, sizeof(*meta));
 
@@ -6674,7 +6640,7 @@ static enum dbg_status qed_mcp_trace_alloc_meta(struct qed_hwfn *p_hwfn,
 				      format_len, format_ptr->format_str);
 	}
 
-	s_mcp_trace_meta_valid = true;
+	meta->is_allocated = true;
 	return DBG_STATUS_OK;
 }
 
@@ -6687,21 +6653,26 @@ static enum dbg_status qed_mcp_trace_alloc_meta(struct qed_hwfn *p_hwfn,
  *               buffer.
  * data_size - size in bytes of data to parse.
  * parsed_buf - destination buffer for parsed data.
- * parsed_bytes - size of parsed data in bytes.
+ * parsed_results_bytes - size of parsed data in bytes.
  */
-static enum dbg_status qed_parse_mcp_trace_buf(u8 *trace_buf,
+static enum dbg_status qed_parse_mcp_trace_buf(struct qed_hwfn *p_hwfn,
+					       u8 *trace_buf,
 					       u32 trace_buf_size,
 					       u32 data_offset,
 					       u32 data_size,
 					       char *parsed_buf,
-					       u32 *parsed_bytes)
+					       u32 *parsed_results_bytes)
 {
+	struct dbg_tools_user_data *dev_user_data;
+	struct mcp_trace_meta *meta;
 	u32 param_mask, param_shift;
 	enum dbg_status status;
 
-	*parsed_bytes = 0;
+	dev_user_data = qed_dbg_get_user_data(p_hwfn);
+	meta = &dev_user_data->mcp_trace_meta;
+	*parsed_results_bytes = 0;
 
-	if (!s_mcp_trace_meta_valid)
+	if (!meta->is_allocated)
 		return DBG_STATUS_MCP_TRACE_BAD_DATA;
 
 	status = DBG_STATUS_OK;
@@ -6723,7 +6694,7 @@ static enum dbg_status qed_parse_mcp_trace_buf(u8 *trace_buf,
 		format_idx = header & MFW_TRACE_EVENTID_MASK;
 
 		/* Skip message if its index doesn't exist in the meta data */
-		if (format_idx >= s_mcp_trace_meta.formats_num) {
+		if (format_idx >= meta->formats_num) {
 			u8 format_size =
 				(u8)((header & MFW_TRACE_PRM_SIZE_MASK) >>
 				     MFW_TRACE_PRM_SIZE_SHIFT);
@@ -6738,7 +6709,7 @@ static enum dbg_status qed_parse_mcp_trace_buf(u8 *trace_buf,
 			continue;
 		}
 
-		format_ptr = &s_mcp_trace_meta.formats[format_idx];
+		format_ptr = &meta->formats[format_idx];
 
 		for (i = 0,
 		     param_mask = MCP_TRACE_FORMAT_P1_SIZE_MASK,
@@ -6783,19 +6754,20 @@ static enum dbg_status qed_parse_mcp_trace_buf(u8 *trace_buf,
 			return DBG_STATUS_MCP_TRACE_BAD_DATA;
 
 		/* Print current message to results buffer */
-		*parsed_bytes +=
-			sprintf(qed_get_buf_ptr(parsed_buf, *parsed_bytes),
+		*parsed_results_bytes +=
+			sprintf(qed_get_buf_ptr(parsed_buf,
+						*parsed_results_bytes),
 				"%s %-8s: ",
 				s_mcp_trace_level_str[format_level],
-				s_mcp_trace_meta.modules[format_module]);
-		*parsed_bytes +=
-		    sprintf(qed_get_buf_ptr(parsed_buf, *parsed_bytes),
+				meta->modules[format_module]);
+		*parsed_results_bytes +=
+		    sprintf(qed_get_buf_ptr(parsed_buf, *parsed_results_bytes),
 			    format_ptr->format_str,
 			    params[0], params[1], params[2]);
 	}
 
 	/* Add string NULL terminator */
-	(*parsed_bytes)++;
+	(*parsed_results_bytes)++;
 
 	return status;
 }
@@ -6803,24 +6775,25 @@ static enum dbg_status qed_parse_mcp_trace_buf(u8 *trace_buf,
 /* Parses an MCP Trace dump buffer.
  * If result_buf is not NULL, the MCP Trace results are printed to it.
  * In any case, the required results buffer size is assigned to
- * parsed_bytes.
+ * parsed_results_bytes.
  * The parsing status is returned.
  */
 static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn,
 						u32 *dump_buf,
-						char *parsed_buf,
-						u32 *parsed_bytes)
+						char *results_buf,
+						u32 *parsed_results_bytes,
+						bool free_meta_data)
 {
 	const char *section_name, *param_name, *param_str_val;
 	u32 data_size, trace_data_dwords, trace_meta_dwords;
-	u32 offset, results_offset, parsed_buf_bytes;
+	u32 offset, results_offset, results_buf_bytes;
 	u32 param_num_val, num_section_params;
 	struct mcp_trace *trace;
 	enum dbg_status status;
 	const u32 *meta_buf;
 	u8 *trace_buf;
 
-	*parsed_bytes = 0;
+	*parsed_results_bytes = 0;
 
 	/* Read global_params section */
 	dump_buf += qed_read_section_hdr(dump_buf,
@@ -6831,7 +6804,7 @@ static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn,
 	/* Print global params */
 	dump_buf += qed_print_section_params(dump_buf,
 					     num_section_params,
-					     parsed_buf, &results_offset);
+					     results_buf, &results_offset);
 
 	/* Read trace_data section */
 	dump_buf += qed_read_section_hdr(dump_buf,
@@ -6846,6 +6819,9 @@ static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn,
 
 	/* Prepare trace info */
 	trace = (struct mcp_trace *)dump_buf;
+	if (trace->signature != MFW_TRACE_SIGNATURE || !trace->size)
+		return DBG_STATUS_MCP_TRACE_BAD_DATA;
+
 	trace_buf = (u8 *)dump_buf + sizeof(*trace);
 	offset = trace->trace_oldest;
 	data_size = qed_cyclic_sub(trace->trace_prod, offset, trace->size);
@@ -6865,31 +6841,39 @@ static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn,
 	/* Choose meta data buffer */
 	if (!trace_meta_dwords) {
 		/* Dump doesn't include meta data */
-		if (!s_mcp_trace_meta_arr.ptr)
+		struct dbg_tools_user_data *dev_user_data =
+			qed_dbg_get_user_data(p_hwfn);
+
+		if (!dev_user_data->mcp_trace_user_meta_buf)
 			return DBG_STATUS_MCP_TRACE_NO_META;
-		meta_buf = s_mcp_trace_meta_arr.ptr;
+
+		meta_buf = dev_user_data->mcp_trace_user_meta_buf;
 	} else {
 		/* Dump includes meta data */
 		meta_buf = dump_buf;
 	}
 
 	/* Allocate meta data memory */
-	status = qed_mcp_trace_alloc_meta(p_hwfn, meta_buf, &s_mcp_trace_meta);
+	status = qed_mcp_trace_alloc_meta_data(p_hwfn, meta_buf);
 	if (status != DBG_STATUS_OK)
 		return status;
 
-	status = qed_parse_mcp_trace_buf(trace_buf,
+	status = qed_parse_mcp_trace_buf(p_hwfn,
+					 trace_buf,
 					 trace->size,
 					 offset,
 					 data_size,
-					 parsed_buf ?
-					 parsed_buf + results_offset :
+					 results_buf ?
+					 results_buf + results_offset :
 					 NULL,
-					 &parsed_buf_bytes);
+					 &results_buf_bytes);
 	if (status != DBG_STATUS_OK)
 		return status;
 
-	*parsed_bytes = results_offset + parsed_buf_bytes;
+	if (free_meta_data)
+		qed_mcp_trace_free_meta_data(p_hwfn);
+
+	*parsed_results_bytes = results_offset + results_buf_bytes;
 
 	return DBG_STATUS_OK;
 }
@@ -7361,6 +7345,16 @@ enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr)
 	return DBG_STATUS_OK;
 }
 
+enum dbg_status qed_dbg_alloc_user_data(struct qed_hwfn *p_hwfn)
+{
+	p_hwfn->dbg_user_info = kzalloc(sizeof(struct dbg_tools_user_data),
+					GFP_KERNEL);
+	if (!p_hwfn->dbg_user_info)
+		return DBG_STATUS_VIRT_MEM_ALLOC_FAILED;
+
+	return DBG_STATUS_OK;
+}
+
 const char *qed_dbg_get_status_str(enum dbg_status status)
 {
 	return (status <
@@ -7397,10 +7391,13 @@ enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn,
 				       num_errors, num_warnings);
 }
 
-void qed_dbg_mcp_trace_set_meta_data(u32 *data, u32 size)
+void qed_dbg_mcp_trace_set_meta_data(struct qed_hwfn *p_hwfn,
+				     const u32 *meta_buf)
 {
-	s_mcp_trace_meta_arr.ptr = data;
-	s_mcp_trace_meta_arr.size_in_dwords = size;
+	struct dbg_tools_user_data *dev_user_data =
+		qed_dbg_get_user_data(p_hwfn);
+
+	dev_user_data->mcp_trace_user_meta_buf = meta_buf;
 }
 
 enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn,
@@ -7409,7 +7406,7 @@ enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn,
 						   u32 *results_buf_size)
 {
 	return qed_parse_mcp_trace_dump(p_hwfn,
-					dump_buf, NULL, results_buf_size);
+					dump_buf, NULL, results_buf_size, true);
 }
 
 enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn,
@@ -7421,20 +7418,61 @@ enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn,
 
 	return qed_parse_mcp_trace_dump(p_hwfn,
 					dump_buf,
-					results_buf, &parsed_buf_size);
+					results_buf, &parsed_buf_size, true);
+}
+
+enum dbg_status qed_print_mcp_trace_results_cont(struct qed_hwfn *p_hwfn,
+						 u32 *dump_buf,
+						 char *results_buf)
+{
+	u32 parsed_buf_size;
+
+	return qed_parse_mcp_trace_dump(p_hwfn, dump_buf, results_buf,
+					&parsed_buf_size, false);
 }
 
-enum dbg_status qed_print_mcp_trace_line(u8 *dump_buf,
+enum dbg_status qed_print_mcp_trace_line(struct qed_hwfn *p_hwfn,
+					 u8 *dump_buf,
 					 u32 num_dumped_bytes,
 					 char *results_buf)
 {
-	u32 parsed_bytes;
+	u32 parsed_results_bytes;
 
-	return qed_parse_mcp_trace_buf(dump_buf,
+	return qed_parse_mcp_trace_buf(p_hwfn,
+				       dump_buf,
 				       num_dumped_bytes,
 				       0,
 				       num_dumped_bytes,
-				       results_buf, &parsed_bytes);
+				       results_buf, &parsed_results_bytes);
+}
+
+/* Frees the specified MCP Trace meta data */
+void qed_mcp_trace_free_meta_data(struct qed_hwfn *p_hwfn)
+{
+	struct dbg_tools_user_data *dev_user_data;
+	struct mcp_trace_meta *meta;
+	u32 i;
+
+	dev_user_data = qed_dbg_get_user_data(p_hwfn);
+	meta = &dev_user_data->mcp_trace_meta;
+	if (!meta->is_allocated)
+		return;
+
+	/* Release modules */
+	if (meta->modules) {
+		for (i = 0; i < meta->modules_num; i++)
+			kfree(meta->modules[i]);
+		kfree(meta->modules);
+	}
+
+	/* Release formats */
+	if (meta->formats) {
+		for (i = 0; i < meta->formats_num; i++)
+			kfree(meta->formats[i].format_str);
+		kfree(meta->formats);
+	}
+
+	meta->is_allocated = false;
 }
 
 enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 016ca8a7ec8a..128eb63ca54a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -144,6 +144,12 @@ static void qed_qm_info_free(struct qed_hwfn *p_hwfn)
 	qm_info->wfq_data = NULL;
 }
 
+static void qed_dbg_user_data_free(struct qed_hwfn *p_hwfn)
+{
+	kfree(p_hwfn->dbg_user_info);
+	p_hwfn->dbg_user_info = NULL;
+}
+
 void qed_resc_free(struct qed_dev *cdev)
 {
 	int i;
@@ -183,6 +189,7 @@ void qed_resc_free(struct qed_dev *cdev)
 		qed_l2_free(p_hwfn);
 		qed_dmae_info_free(p_hwfn);
 		qed_dcbx_info_free(p_hwfn);
+		qed_dbg_user_data_free(p_hwfn);
 	}
 }
 
@@ -1083,6 +1090,10 @@ int qed_resc_alloc(struct qed_dev *cdev)
 		rc = qed_dcbx_info_alloc(p_hwfn);
 		if (rc)
 			goto alloc_err;
+
+		rc = qed_dbg_alloc_user_data(p_hwfn);
+		if (rc)
+			goto alloc_err;
 	}
 
 	cdev->reset_stats = kzalloc(sizeof(*cdev->reset_stats), GFP_KERNEL);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 8faceb691657..21ec8091a24a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -274,7 +274,8 @@ struct core_rx_start_ramrod_data {
 	u8 mf_si_mcast_accept_all;
 	struct core_rx_action_on_error action_on_error;
 	u8 gsi_offload_flag;
-	u8 reserved[6];
+	u8 wipe_inner_vlan_pri_en;
+	u8 reserved[5];
 };
 
 /* Ramrod data for rx queue stop ramrod */
@@ -351,7 +352,8 @@ struct core_tx_start_ramrod_data {
 	__le16 pbl_size;
 	__le16 qm_pq_id;
 	u8 gsi_offload_flag;
-	u8 resrved[3];
+	u8 vport_id;
+	u8 resrved[2];
 };
 
 /* Ramrod data for tx queue stop ramrod */
@@ -914,6 +916,16 @@ struct eth_rx_rate_limit {
 	__le16 reserved1;
 };
 
+/* Update RSS indirection table entry command */
+struct eth_tstorm_rss_update_data {
+	u8 valid;
+	u8 vport_id;
+	u8 ind_table_index;
+	u8 reserved;
+	__le16 ind_table_value;
+	__le16 reserved1;
+};
+
 struct eth_ustorm_per_pf_stat {
 	struct regpair rcv_lb_ucast_bytes;
 	struct regpair rcv_lb_mcast_bytes;
@@ -1241,6 +1253,10 @@ struct rl_update_ramrod_data {
 	u8 rl_id_first;
 	u8 rl_id_last;
 	u8 rl_dc_qcn_flg;
+	u8 dcqcn_reset_alpha_on_idle;
+	u8 rl_bc_stage_th;
+	u8 rl_timer_stage_th;
+	u8 reserved1;
 	__le32 rl_bc_rate;
 	__le16 rl_max_rate;
 	__le16 rl_r_ai;
@@ -1249,7 +1265,7 @@ struct rl_update_ramrod_data {
 	__le32 dcqcn_k_us;
 	__le32 dcqcn_timeuot_us;
 	__le32 qcn_timeuot_us;
-	__le32 reserved[2];
+	__le32 reserved2;
 };
 
 /* Slowpath Element (SPQE) */
@@ -3322,6 +3338,25 @@ enum dbg_status qed_dbg_read_attn(struct qed_hwfn *p_hwfn,
 enum dbg_status qed_dbg_print_attn(struct qed_hwfn *p_hwfn,
 				   struct dbg_attn_block_result *results);
 
+/******************************* Data Types **********************************/
+
+struct mcp_trace_format {
+	u32 data;
+#define MCP_TRACE_FORMAT_MODULE_MASK	0x0000ffff
+#define MCP_TRACE_FORMAT_MODULE_SHIFT	0
+#define MCP_TRACE_FORMAT_LEVEL_MASK	0x00030000
+#define MCP_TRACE_FORMAT_LEVEL_SHIFT	16
+#define MCP_TRACE_FORMAT_P1_SIZE_MASK	0x000c0000
+#define MCP_TRACE_FORMAT_P1_SIZE_SHIFT	18
+#define MCP_TRACE_FORMAT_P2_SIZE_MASK	0x00300000
+#define MCP_TRACE_FORMAT_P2_SIZE_SHIFT	20
+#define MCP_TRACE_FORMAT_P3_SIZE_MASK	0x00c00000
+#define MCP_TRACE_FORMAT_P3_SIZE_SHIFT	22
+#define MCP_TRACE_FORMAT_LEN_MASK	0xff000000
+#define MCP_TRACE_FORMAT_LEN_SHIFT	24
+	char *format_str;
+};
+
 /******************************** Constants **********************************/
 
 #define MAX_NAME_LEN	16
@@ -3336,6 +3371,13 @@ enum dbg_status qed_dbg_print_attn(struct qed_hwfn *p_hwfn,
  */
 enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr);
 
+/**
+ * @brief qed_dbg_alloc_user_data - Allocates user debug data.
+ *
+ * @param p_hwfn -		 HW device data
+ */
+enum dbg_status qed_dbg_alloc_user_data(struct qed_hwfn *p_hwfn);
+
 /**
  * @brief qed_dbg_get_status_str - Returns a string for the specified status.
  *
@@ -3381,8 +3423,7 @@ enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn,
 					   u32 *num_warnings);
 
 /**
- * @brief qed_dbg_mcp_trace_set_meta_data - Sets a pointer to the MCP Trace
- *	meta data.
+ * @brief qed_dbg_mcp_trace_set_meta_data - Sets the MCP Trace meta data.
  *
  * Needed in case the MCP Trace dump doesn't contain the meta data (e.g. due to
  * no NVRAM access).
@@ -3390,7 +3431,8 @@ enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn,
  * @param data - pointer to MCP Trace meta data
  * @param size - size of MCP Trace meta data in dwords
  */
-void qed_dbg_mcp_trace_set_meta_data(u32 *data, u32 size);
+void qed_dbg_mcp_trace_set_meta_data(struct qed_hwfn *p_hwfn,
+				     const u32 *meta_buf);
 
 /**
  * @brief qed_get_mcp_trace_results_buf_size - Returns the required buffer size
@@ -3424,19 +3466,45 @@ enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn,
 					    u32 num_dumped_dwords,
 					    char *results_buf);
 
+/**
+ * @brief qed_print_mcp_trace_results_cont - Prints MCP Trace results, and
+ * keeps the MCP trace meta data allocated, to support continuous MCP Trace
+ * parsing. After the continuous parsing ends, mcp_trace_free_meta_data should
+ * be called to free the meta data.
+ *
+ * @param p_hwfn -	      HW device data
+ * @param dump_buf -	      mcp trace dump buffer, starting from the header.
+ * @param results_buf -	      buffer for printing the mcp trace results.
+ *
+ * @return error if the parsing fails, ok otherwise.
+ */
+enum dbg_status qed_print_mcp_trace_results_cont(struct qed_hwfn *p_hwfn,
+						 u32 *dump_buf,
+						 char *results_buf);
+
 /**
  * @brief print_mcp_trace_line - Prints MCP Trace results for a single line
  *
+ * @param p_hwfn -	      HW device data
  * @param dump_buf -	      mcp trace dump buffer, starting from the header.
  * @param num_dumped_bytes -  number of bytes that were dumped.
  * @param results_buf -	      buffer for printing the mcp trace results.
  *
  * @return error if the parsing fails, ok otherwise.
  */
-enum dbg_status qed_print_mcp_trace_line(u8 *dump_buf,
+enum dbg_status qed_print_mcp_trace_line(struct qed_hwfn *p_hwfn,
+					 u8 *dump_buf,
 					 u32 num_dumped_bytes,
 					 char *results_buf);
 
+/**
+ * @brief mcp_trace_free_meta_data - Frees the MCP Trace meta data.
+ * Should be called after continuous MCP Trace parsing.
+ *
+ * @param p_hwfn - HW device data
+ */
+void qed_mcp_trace_free_meta_data(struct qed_hwfn *p_hwfn);
+
 /**
  * @brief qed_get_reg_fifo_results_buf_size - Returns the required buffer size
  *	for reg_fifo results (in bytes).
@@ -4303,154 +4371,161 @@ void qed_set_rdma_error_level(struct qed_hwfn *p_hwfn,
 	(IRO[29].base + ((pf_id) * IRO[29].m1))
 #define ETH_RX_RATE_LIMIT_SIZE				(IRO[29].size)
 
+/* RSS indirection table entry update command per PF offset in TSTORM PF BAR0.
+ * Use eth_tstorm_rss_update_data for update.
+ */
+#define TSTORM_ETH_RSS_UPDATE_OFFSET(pf_id) \
+	(IRO[30].base + ((pf_id) * IRO[30].m1))
+#define TSTORM_ETH_RSS_UPDATE_SIZE			(IRO[30].size)
+
 /* Xstorm queue zone */
 #define XSTORM_ETH_QUEUE_ZONE_OFFSET(queue_id) \
-	(IRO[30].base + ((queue_id) * IRO[30].m1))
-#define XSTORM_ETH_QUEUE_ZONE_SIZE			(IRO[30].size)
+	(IRO[31].base + ((queue_id) * IRO[31].m1))
+#define XSTORM_ETH_QUEUE_ZONE_SIZE			(IRO[31].size)
 
 /* Ystorm cqe producer */
 #define YSTORM_TOE_CQ_PROD_OFFSET(rss_id) \
-	(IRO[31].base + ((rss_id) * IRO[31].m1))
-#define YSTORM_TOE_CQ_PROD_SIZE				(IRO[31].size)
+	(IRO[32].base + ((rss_id) * IRO[32].m1))
+#define YSTORM_TOE_CQ_PROD_SIZE				(IRO[32].size)
 
 /* Ustorm cqe producer */
 #define USTORM_TOE_CQ_PROD_OFFSET(rss_id) \
-	(IRO[32].base + ((rss_id) * IRO[32].m1))
-#define USTORM_TOE_CQ_PROD_SIZE				(IRO[32].size)
+	(IRO[33].base + ((rss_id) * IRO[33].m1))
+#define USTORM_TOE_CQ_PROD_SIZE				(IRO[33].size)
 
 /* Ustorm grq producer */
 #define USTORM_TOE_GRQ_PROD_OFFSET(pf_id) \
-	(IRO[33].base + ((pf_id) * IRO[33].m1))
-#define USTORM_TOE_GRQ_PROD_SIZE			(IRO[33].size)
+	(IRO[34].base + ((pf_id) * IRO[34].m1))
+#define USTORM_TOE_GRQ_PROD_SIZE			(IRO[34].size)
 
 /* Tstorm cmdq-cons of given command queue-id */
 #define TSTORM_SCSI_CMDQ_CONS_OFFSET(cmdq_queue_id) \
-	(IRO[34].base + ((cmdq_queue_id) * IRO[34].m1))
-#define TSTORM_SCSI_CMDQ_CONS_SIZE			(IRO[34].size)
+	(IRO[35].base + ((cmdq_queue_id) * IRO[35].m1))
+#define TSTORM_SCSI_CMDQ_CONS_SIZE			(IRO[35].size)
 
 /* Tstorm (reflects M-Storm) bdq-external-producer of given function ID,
  * BDqueue-id.
  */
 #define TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \
-	(IRO[35].base + ((func_id) * IRO[35].m1) + ((bdq_id) * IRO[35].m2))
-#define TSTORM_SCSI_BDQ_EXT_PROD_SIZE			(IRO[35].size)
+	(IRO[36].base + ((func_id) * IRO[36].m1) + ((bdq_id) * IRO[36].m2))
+#define TSTORM_SCSI_BDQ_EXT_PROD_SIZE			(IRO[36].size)
 
 /* Mstorm bdq-external-producer of given BDQ resource ID, BDqueue-id */
 #define MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \
-	(IRO[36].base + ((func_id) * IRO[36].m1) + ((bdq_id) * IRO[36].m2))
-#define MSTORM_SCSI_BDQ_EXT_PROD_SIZE			(IRO[36].size)
+	(IRO[37].base + ((func_id) * IRO[37].m1) + ((bdq_id) * IRO[37].m2))
+#define MSTORM_SCSI_BDQ_EXT_PROD_SIZE			(IRO[37].size)
 
 /* Tstorm iSCSI RX stats */
 #define TSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \
-	(IRO[37].base + ((pf_id) * IRO[37].m1))
-#define TSTORM_ISCSI_RX_STATS_SIZE			(IRO[37].size)
+	(IRO[38].base + ((pf_id) * IRO[38].m1))
+#define TSTORM_ISCSI_RX_STATS_SIZE			(IRO[38].size)
 
 /* Mstorm iSCSI RX stats */
 #define MSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \
-	(IRO[38].base + ((pf_id) * IRO[38].m1))
-#define MSTORM_ISCSI_RX_STATS_SIZE			(IRO[38].size)
+	(IRO[39].base + ((pf_id) * IRO[39].m1))
+#define MSTORM_ISCSI_RX_STATS_SIZE			(IRO[39].size)
 
 /* Ustorm iSCSI RX stats */
 #define USTORM_ISCSI_RX_STATS_OFFSET(pf_id) \
-	(IRO[39].base + ((pf_id) * IRO[39].m1))
-#define USTORM_ISCSI_RX_STATS_SIZE			(IRO[39].size)
+	(IRO[40].base + ((pf_id) * IRO[40].m1))
+#define USTORM_ISCSI_RX_STATS_SIZE			(IRO[40].size)
 
 /* Xstorm iSCSI TX stats */
 #define XSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \
-	(IRO[40].base + ((pf_id) * IRO[40].m1))
-#define XSTORM_ISCSI_TX_STATS_SIZE			(IRO[40].size)
+	(IRO[41].base + ((pf_id) * IRO[41].m1))
+#define XSTORM_ISCSI_TX_STATS_SIZE			(IRO[41].size)
 
 /* Ystorm iSCSI TX stats */
 #define YSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \
-	(IRO[41].base + ((pf_id) * IRO[41].m1))
-#define YSTORM_ISCSI_TX_STATS_SIZE			(IRO[41].size)
+	(IRO[42].base + ((pf_id) * IRO[42].m1))
+#define YSTORM_ISCSI_TX_STATS_SIZE			(IRO[42].size)
 
 /* Pstorm iSCSI TX stats */
 #define PSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \
-	(IRO[42].base + ((pf_id) * IRO[42].m1))
-#define PSTORM_ISCSI_TX_STATS_SIZE			(IRO[42].size)
+	(IRO[43].base + ((pf_id) * IRO[43].m1))
+#define PSTORM_ISCSI_TX_STATS_SIZE			(IRO[43].size)
 
 /* Tstorm FCoE RX stats */
 #define TSTORM_FCOE_RX_STATS_OFFSET(pf_id) \
-	(IRO[43].base + ((pf_id) * IRO[43].m1))
-#define TSTORM_FCOE_RX_STATS_SIZE			(IRO[43].size)
+	(IRO[44].base + ((pf_id) * IRO[44].m1))
+#define TSTORM_FCOE_RX_STATS_SIZE			(IRO[44].size)
 
 /* Pstorm FCoE TX stats */
 #define PSTORM_FCOE_TX_STATS_OFFSET(pf_id) \
-	(IRO[44].base + ((pf_id) * IRO[44].m1))
-#define PSTORM_FCOE_TX_STATS_SIZE			(IRO[44].size)
+	(IRO[45].base + ((pf_id) * IRO[45].m1))
+#define PSTORM_FCOE_TX_STATS_SIZE			(IRO[45].size)
 
 /* Pstorm RDMA queue statistics */
 #define PSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \
-	(IRO[45].base + ((rdma_stat_counter_id) * IRO[45].m1))
-#define PSTORM_RDMA_QUEUE_STAT_SIZE			(IRO[45].size)
+	(IRO[46].base + ((rdma_stat_counter_id) * IRO[46].m1))
+#define PSTORM_RDMA_QUEUE_STAT_SIZE			(IRO[46].size)
 
 /* Tstorm RDMA queue statistics */
 #define TSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \
-	(IRO[46].base + ((rdma_stat_counter_id) * IRO[46].m1))
-#define TSTORM_RDMA_QUEUE_STAT_SIZE			(IRO[46].size)
+	(IRO[47].base + ((rdma_stat_counter_id) * IRO[47].m1))
+#define TSTORM_RDMA_QUEUE_STAT_SIZE			(IRO[47].size)
 
 /* Xstorm error level for assert */
 #define XSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
-	(IRO[47].base +	((pf_id) * IRO[47].m1))
-#define XSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[47].size)
+	(IRO[48].base +	((pf_id) * IRO[48].m1))
+#define XSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[48].size)
 
 /* Ystorm error level for assert */
 #define YSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
-	(IRO[48].base + ((pf_id) * IRO[48].m1))
-#define YSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[48].size)
+	(IRO[49].base + ((pf_id) * IRO[49].m1))
+#define YSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[49].size)
 
 /* Pstorm error level for assert */
 #define PSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
-	(IRO[49].base +	((pf_id) * IRO[49].m1))
-#define PSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[49].size)
+	(IRO[50].base +	((pf_id) * IRO[50].m1))
+#define PSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[50].size)
 
 /* Tstorm error level for assert */
 #define TSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
-	(IRO[50].base +	((pf_id) * IRO[50].m1))
-#define TSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[50].size)
+	(IRO[51].base +	((pf_id) * IRO[51].m1))
+#define TSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[51].size)
 
 /* Mstorm error level for assert */
 #define MSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
-	(IRO[51].base + ((pf_id) * IRO[51].m1))
-#define MSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[51].size)
+	(IRO[52].base + ((pf_id) * IRO[52].m1))
+#define MSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[52].size)
 
 /* Ustorm error level for assert */
 #define USTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
-	(IRO[52].base + ((pf_id) * IRO[52].m1))
-#define USTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[52].size)
+	(IRO[53].base + ((pf_id) * IRO[53].m1))
+#define USTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[53].size)
 
 /* Xstorm iWARP rxmit stats */
 #define XSTORM_IWARP_RXMIT_STATS_OFFSET(pf_id) \
-	(IRO[53].base +	((pf_id) * IRO[53].m1))
-#define XSTORM_IWARP_RXMIT_STATS_SIZE			(IRO[53].size)
+	(IRO[54].base +	((pf_id) * IRO[54].m1))
+#define XSTORM_IWARP_RXMIT_STATS_SIZE			(IRO[54].size)
 
 /* Tstorm RoCE Event Statistics */
 #define TSTORM_ROCE_EVENTS_STAT_OFFSET(roce_pf_id) \
-	(IRO[54].base + ((roce_pf_id) * IRO[54].m1))
-#define TSTORM_ROCE_EVENTS_STAT_SIZE			(IRO[54].size)
+	(IRO[55].base + ((roce_pf_id) * IRO[55].m1))
+#define TSTORM_ROCE_EVENTS_STAT_SIZE			(IRO[55].size)
 
 /* DCQCN Received Statistics */
 #define YSTORM_ROCE_DCQCN_RECEIVED_STATS_OFFSET(roce_pf_id) \
-	(IRO[55].base + ((roce_pf_id) * IRO[55].m1))
-#define YSTORM_ROCE_DCQCN_RECEIVED_STATS_SIZE		(IRO[55].size)
+	(IRO[56].base + ((roce_pf_id) * IRO[56].m1))
+#define YSTORM_ROCE_DCQCN_RECEIVED_STATS_SIZE		(IRO[56].size)
 
 /* RoCE Error Statistics */
 #define YSTORM_ROCE_ERROR_STATS_OFFSET(roce_pf_id) \
-	(IRO[56].base + ((roce_pf_id) * IRO[56].m1))
-#define YSTORM_ROCE_ERROR_STATS_SIZE			(IRO[56].size)
+	(IRO[57].base + ((roce_pf_id) * IRO[57].m1))
+#define YSTORM_ROCE_ERROR_STATS_SIZE			(IRO[57].size)
 
 /* DCQCN Sent Statistics */
 #define PSTORM_ROCE_DCQCN_SENT_STATS_OFFSET(roce_pf_id) \
-	(IRO[57].base + ((roce_pf_id) * IRO[57].m1))
-#define PSTORM_ROCE_DCQCN_SENT_STATS_SIZE		(IRO[57].size)
+	(IRO[58].base + ((roce_pf_id) * IRO[58].m1))
+#define PSTORM_ROCE_DCQCN_SENT_STATS_SIZE		(IRO[58].size)
 
 /* RoCE CQEs Statistics */
 #define USTORM_ROCE_CQE_STATS_OFFSET(roce_pf_id) \
-	(IRO[58].base + ((roce_pf_id) * IRO[58].m1))
-#define USTORM_ROCE_CQE_STATS_SIZE			(IRO[58].size)
+	(IRO[59].base + ((roce_pf_id) * IRO[59].m1))
+#define USTORM_ROCE_CQE_STATS_SIZE			(IRO[59].size)
 
-static const struct iro iro_arr[59] = {
+static const struct iro iro_arr[60] = {
 	{0x0, 0x0, 0x0, 0x0, 0x8},
 	{0x4cb8, 0x88, 0x0, 0x0, 0x88},
 	{0x6530, 0x20, 0x0, 0x0, 0x20},
@@ -4461,14 +4536,14 @@ static const struct iro iro_arr[59] = {
 	{0x84, 0x8, 0x0, 0x0, 0x2},
 	{0x4c48, 0x0, 0x0, 0x0, 0x78},
 	{0x3e38, 0x0, 0x0, 0x0, 0x78},
-	{0x2b78, 0x0, 0x0, 0x0, 0x78},
+	{0x3ef8, 0x0, 0x0, 0x0, 0x78},
 	{0x4c40, 0x0, 0x0, 0x0, 0x78},
 	{0x4998, 0x0, 0x0, 0x0, 0x78},
 	{0x7f50, 0x0, 0x0, 0x0, 0x78},
 	{0xa28, 0x8, 0x0, 0x0, 0x8},
 	{0x6210, 0x10, 0x0, 0x0, 0x10},
 	{0xb820, 0x30, 0x0, 0x0, 0x30},
-	{0x96c0, 0x30, 0x0, 0x0, 0x30},
+	{0xa990, 0x30, 0x0, 0x0, 0x30},
 	{0x4b68, 0x80, 0x0, 0x0, 0x40},
 	{0x1f8, 0x4, 0x0, 0x0, 0x4},
 	{0x53a8, 0x80, 0x4, 0x0, 0x4},
@@ -4476,11 +4551,12 @@ static const struct iro iro_arr[59] = {
 	{0x4ba8, 0x80, 0x0, 0x0, 0x20},
 	{0x8158, 0x40, 0x0, 0x0, 0x30},
 	{0xe770, 0x60, 0x0, 0x0, 0x60},
-	{0x2d10, 0x80, 0x0, 0x0, 0x38},
-	{0xf2b8, 0x78, 0x0, 0x0, 0x78},
+	{0x4090, 0x80, 0x0, 0x0, 0x38},
+	{0xfea8, 0x78, 0x0, 0x0, 0x78},
 	{0x1f8, 0x4, 0x0, 0x0, 0x4},
 	{0xaf20, 0x0, 0x0, 0x0, 0xf0},
 	{0xb010, 0x8, 0x0, 0x0, 0x8},
+	{0xc00, 0x8, 0x0, 0x0, 0x8},
 	{0x1f8, 0x8, 0x0, 0x0, 0x8},
 	{0xac0, 0x8, 0x0, 0x0, 0x8},
 	{0x2578, 0x8, 0x0, 0x0, 0x8},
@@ -4492,23 +4568,23 @@ static const struct iro iro_arr[59] = {
 	{0x12908, 0x18, 0x0, 0x0, 0x10},
 	{0x11aa8, 0x40, 0x0, 0x0, 0x18},
 	{0xa588, 0x50, 0x0, 0x0, 0x20},
-	{0x8700, 0x40, 0x0, 0x0, 0x28},
-	{0x10300, 0x18, 0x0, 0x0, 0x10},
+	{0x8f00, 0x40, 0x0, 0x0, 0x28},
+	{0x10e30, 0x18, 0x0, 0x0, 0x10},
 	{0xde48, 0x48, 0x0, 0x0, 0x38},
-	{0x10768, 0x20, 0x0, 0x0, 0x20},
-	{0x2d48, 0x80, 0x0, 0x0, 0x10},
+	{0x11298, 0x20, 0x0, 0x0, 0x20},
+	{0x40c8, 0x80, 0x0, 0x0, 0x10},
 	{0x5048, 0x10, 0x0, 0x0, 0x10},
 	{0xc748, 0x8, 0x0, 0x0, 0x1},
-	{0xa128, 0x8, 0x0, 0x0, 0x1},
-	{0x10f00, 0x8, 0x0, 0x0, 0x1},
+	{0xa928, 0x8, 0x0, 0x0, 0x1},
+	{0x11a30, 0x8, 0x0, 0x0, 0x1},
 	{0xf030, 0x8, 0x0, 0x0, 0x1},
 	{0x13028, 0x8, 0x0, 0x0, 0x1},
 	{0x12c58, 0x8, 0x0, 0x0, 0x1},
 	{0xc9b8, 0x30, 0x0, 0x0, 0x10},
 	{0xed90, 0x28, 0x0, 0x0, 0x28},
-	{0xa520, 0x18, 0x0, 0x0, 0x18},
-	{0xa6a0, 0x8, 0x0, 0x0, 0x8},
-	{0x13108, 0x8, 0x0, 0x0, 0x8},
+	{0xad20, 0x18, 0x0, 0x0, 0x18},
+	{0xaea0, 0x8, 0x0, 0x0, 0x8},
+	{0x13c38, 0x8, 0x0, 0x0, 0x8},
 	{0x13c50, 0x18, 0x0, 0x0, 0x18},
 };
 
@@ -5661,6 +5737,14 @@ enum eth_filter_type {
 	MAX_ETH_FILTER_TYPE
 };
 
+/* inner to inner vlan priority translation configurations */
+struct eth_in_to_in_pri_map_cfg {
+	u8 inner_vlan_pri_remap_en;
+	u8 reserved[7];
+	u8 non_rdma_in_to_in_pri_map[8];
+	u8 rdma_in_to_in_pri_map[8];
+};
+
 /* Eth IPv4 Fragment Type */
 enum eth_ipv4_frag_type {
 	ETH_IPV4_NOT_FRAG,
@@ -6018,6 +6102,14 @@ struct tx_queue_update_ramrod_data {
 	struct regpair reserved1[5];
 };
 
+/* Inner to Inner VLAN priority map update mode */
+enum update_in_to_in_pri_map_mode_enum {
+	ETH_IN_TO_IN_PRI_MAP_UPDATE_DISABLED,
+	ETH_IN_TO_IN_PRI_MAP_UPDATE_NON_RDMA_TBL,
+	ETH_IN_TO_IN_PRI_MAP_UPDATE_RDMA_TBL,
+	MAX_UPDATE_IN_TO_IN_PRI_MAP_MODE_ENUM
+};
+
 /* Ramrod data for vport update ramrod */
 struct vport_filter_update_ramrod_data {
 	struct eth_filter_cmd_header filter_cmd_hdr;
@@ -6048,7 +6140,8 @@ struct vport_start_ramrod_data {
 	u8 zero_placement_offset;
 	u8 ctl_frame_mac_check_en;
 	u8 ctl_frame_ethtype_check_en;
-	u8 reserved[1];
+	u8 wipe_inner_vlan_pri_en;
+	struct eth_in_to_in_pri_map_cfg in_to_in_vlan_pri_map_cfg;
 };
 
 /* Ramrod data for vport stop ramrod */
@@ -6100,7 +6193,9 @@ struct vport_update_ramrod_data_cmn {
 	u8 update_ctl_frame_checks_en_flg;
 	u8 ctl_frame_mac_check_en;
 	u8 ctl_frame_ethtype_check_en;
-	u8 reserved[15];
+	u8 update_in_to_in_pri_map_mode;
+	u8 in_to_in_pri_map[8];
+	u8 reserved[6];
 };
 
 struct vport_update_ramrod_mcast {
@@ -6929,11 +7024,6 @@ struct mstorm_rdma_task_st_ctx {
 	struct regpair temp[4];
 };
 
-/* The roce task context of Ustorm */
-struct ustorm_rdma_task_st_ctx {
-	struct regpair temp[2];
-};
-
 struct e4_ustorm_rdma_task_ag_ctx {
 	u8 reserved;
 	u8 state;
@@ -7007,8 +7097,6 @@ struct e4_rdma_task_context {
 	struct e4_mstorm_rdma_task_ag_ctx mstorm_ag_context;
 	struct mstorm_rdma_task_st_ctx mstorm_st_context;
 	struct rdif_task_context rdif_context;
-	struct ustorm_rdma_task_st_ctx ustorm_st_context;
-	struct regpair ustorm_st_padding[2];
 	struct e4_ustorm_rdma_task_ag_ctx ustorm_ag_context;
 };
 
@@ -7388,7 +7476,7 @@ struct e4_ustorm_rdma_conn_ag_ctx {
 #define E4_USTORM_RDMA_CONN_AG_CTX_RULE8EN_MASK		0x1
 #define E4_USTORM_RDMA_CONN_AG_CTX_RULE8EN_SHIFT	7
 	u8 byte2;
-	u8 byte3;
+	u8 nvmf_only;
 	__le16 conn_dpi;
 	__le16 word1;
 	__le32 cq_cons;
@@ -7831,7 +7919,12 @@ struct roce_create_qp_req_ramrod_data {
 	struct regpair qp_handle_for_cqe;
 	struct regpair qp_handle_for_async;
 	u8 stats_counter_id;
-	u8 reserved3[7];
+	u8 reserved3[6];
+	u8 flags2;
+#define ROCE_CREATE_QP_REQ_RAMROD_DATA_EDPM_MODE_MASK			0x1
+#define ROCE_CREATE_QP_REQ_RAMROD_DATA_EDPM_MODE_SHIFT			0
+#define ROCE_CREATE_QP_REQ_RAMROD_DATA_RESERVED_MASK			0x7F
+#define ROCE_CREATE_QP_REQ_RAMROD_DATA_RESERVED_SHIFT			1
 	__le16 regular_latency_phy_queue;
 	__le16 dpi;
 };
@@ -7954,6 +8047,7 @@ enum roce_event_opcode {
 	ROCE_EVENT_DESTROY_QP,
 	ROCE_EVENT_CREATE_UD_QP,
 	ROCE_EVENT_DESTROY_UD_QP,
+	ROCE_EVENT_FUNC_UPDATE,
 	MAX_ROCE_EVENT_OPCODE
 };
 
@@ -7962,7 +8056,13 @@ struct roce_init_func_params {
 	u8 ll2_queue_id;
 	u8 cnp_vlan_priority;
 	u8 cnp_dscp;
-	u8 reserved;
+	u8 flags;
+#define ROCE_INIT_FUNC_PARAMS_DCQCN_NP_EN_MASK		0x1
+#define ROCE_INIT_FUNC_PARAMS_DCQCN_NP_EN_SHIFT		0
+#define ROCE_INIT_FUNC_PARAMS_DCQCN_RP_EN_MASK		0x1
+#define ROCE_INIT_FUNC_PARAMS_DCQCN_RP_EN_SHIFT		1
+#define ROCE_INIT_FUNC_PARAMS_RESERVED0_MASK		0x3F
+#define ROCE_INIT_FUNC_PARAMS_RESERVED0_SHIFT		2
 	__le32 cnp_send_timeout;
 	__le16 rl_offset;
 	u8 rl_count_log;
@@ -8109,9 +8209,24 @@ enum roce_ramrod_cmd_id {
 	ROCE_RAMROD_DESTROY_QP,
 	ROCE_RAMROD_CREATE_UD_QP,
 	ROCE_RAMROD_DESTROY_UD_QP,
+	ROCE_RAMROD_FUNC_UPDATE,
 	MAX_ROCE_RAMROD_CMD_ID
 };
 
+/* RoCE func init ramrod data */
+struct roce_update_func_params {
+	u8 cnp_vlan_priority;
+	u8 cnp_dscp;
+	__le16 flags;
+#define ROCE_UPDATE_FUNC_PARAMS_DCQCN_NP_EN_MASK	0x1
+#define ROCE_UPDATE_FUNC_PARAMS_DCQCN_NP_EN_SHIFT	0
+#define ROCE_UPDATE_FUNC_PARAMS_DCQCN_RP_EN_MASK	0x1
+#define ROCE_UPDATE_FUNC_PARAMS_DCQCN_RP_EN_SHIFT	1
+#define ROCE_UPDATE_FUNC_PARAMS_RESERVED0_MASK		0x3FFF
+#define ROCE_UPDATE_FUNC_PARAMS_RESERVED0_SHIFT		2
+	__le32 cnp_send_timeout;
+};
+
 struct e4_xstorm_roce_conn_ag_ctx_dq_ext_ld_part {
 	u8 reserved0;
 	u8 state;
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 0081fa6d1268..03f59a28fefd 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -110,7 +110,7 @@
 
 #define FW_MAJOR_VERSION	8
 #define FW_MINOR_VERSION        37
-#define FW_REVISION_VERSION     2
+#define FW_REVISION_VERSION     7
 #define FW_ENGINEERING_VERSION	0
 
 /***********************/
@@ -931,12 +931,12 @@ struct db_rdma_dpm_params {
 #define DB_RDMA_DPM_PARAMS_WQE_SIZE_SHIFT		16
 #define DB_RDMA_DPM_PARAMS_RESERVED0_MASK		0x1
 #define DB_RDMA_DPM_PARAMS_RESERVED0_SHIFT		27
-#define DB_RDMA_DPM_PARAMS_COMPLETION_FLG_MASK		0x1
-#define DB_RDMA_DPM_PARAMS_COMPLETION_FLG_SHIFT		28
+#define DB_RDMA_DPM_PARAMS_ACK_REQUEST_MASK		0x1
+#define DB_RDMA_DPM_PARAMS_ACK_REQUEST_SHIFT		28
 #define DB_RDMA_DPM_PARAMS_S_FLG_MASK			0x1
 #define DB_RDMA_DPM_PARAMS_S_FLG_SHIFT			29
-#define DB_RDMA_DPM_PARAMS_RESERVED1_MASK		0x1
-#define DB_RDMA_DPM_PARAMS_RESERVED1_SHIFT		30
+#define DB_RDMA_DPM_PARAMS_COMPLETION_FLG_MASK		0x1
+#define DB_RDMA_DPM_PARAMS_COMPLETION_FLG_SHIFT		30
 #define DB_RDMA_DPM_PARAMS_CONN_TYPE_IS_IWARP_MASK	0x1
 #define DB_RDMA_DPM_PARAMS_CONN_TYPE_IS_IWARP_SHIFT	31
 };
diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h
index b34c573f2b30..66aba505ec56 100644
--- a/include/linux/qed/iscsi_common.h
+++ b/include/linux/qed/iscsi_common.h
@@ -896,7 +896,7 @@ struct e4_ustorm_iscsi_task_ag_ctx {
 	__le32 exp_cont_len;
 	__le32 total_data_acked;
 	__le32 exp_data_acked;
-	u8 next_tid_valid;
+	u8 byte2;
 	u8 byte3;
 	__le16 word1;
 	__le16 next_tid;
-- 
cgit v1.2.3


From 0153167aebd0808fb90031dba07d4e696557474c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 10 Sep 2018 09:11:28 -0700
Subject: net/ipv6: Remove rt6i_prefsrc

After the conversion to fib6_info, rt6i_prefsrc has a single user that
reads the value and otherwise it is only set. The one reader can be
converted to use rt->from so rt6i_prefsrc can be removed, reducing
rt6_info by another 20 bytes.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/cxgbi/libcxgbi.c |  4 ++--
 include/net/ip6_fib.h         |  1 -
 net/ipv6/route.c              | 27 ---------------------------
 3 files changed, 2 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 3f3af5e74a07..6b3ea50c594e 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -784,7 +784,7 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
 	csk->mtu = mtu;
 	csk->dst = dst;
 
-	if (ipv6_addr_any(&rt->rt6i_prefsrc.addr)) {
+	if (!rt->from || ipv6_addr_any(&rt->from->fib6_prefsrc.addr)) {
 		struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
 
 		err = ipv6_dev_get_saddr(&init_net, idev ? idev->dev : NULL,
@@ -795,7 +795,7 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
 			goto rel_rt;
 		}
 	} else {
-		pref_saddr = rt->rt6i_prefsrc.addr;
+		pref_saddr = rt->from->fib6_prefsrc.addr;
 	}
 
 	csk->csk_family = AF_INET6;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3d4930528db0..c7496663f99a 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -182,7 +182,6 @@ struct rt6_info {
 	struct in6_addr			rt6i_gateway;
 	struct inet6_dev		*rt6i_idev;
 	u32				rt6i_flags;
-	struct rt6key			rt6i_prefsrc;
 
 	struct list_head		rt6i_uncached;
 	struct uncached_list		*rt6i_uncached_list;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fa62acc923c..b3268eaa394b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -995,7 +995,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 #ifdef CONFIG_IPV6_SUBTREES
 	rt->rt6i_src = ort->fib6_src;
 #endif
-	rt->rt6i_prefsrc = ort->fib6_prefsrc;
 }
 
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1449,11 +1448,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	if (ort->fib6_src.plen)
 		src_key = &nrt->rt6i_src.addr;
 #endif
-
-	/* Update rt6i_prefsrc as it could be changed
-	 * in rt6_remove_prefsrc()
-	 */
-	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
 	/* rt6_mtu_change() might lower mtu on ort.
 	 * Only insert this exception route if its mtu
 	 * is less than ort's mtu value.
@@ -1635,25 +1629,6 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 	rcu_read_unlock();
 }
 
-static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
-{
-	struct rt6_exception_bucket *bucket;
-	struct rt6_exception *rt6_ex;
-	int i;
-
-	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
-					lockdep_is_held(&rt6_exception_lock));
-
-	if (bucket) {
-		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
-			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
-				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
-			}
-			bucket++;
-		}
-	}
-}
-
 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
 					 struct rt6_info *rt, int mtu)
 {
@@ -3793,8 +3768,6 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 		spin_lock_bh(&rt6_exception_lock);
 		/* remove prefsrc entry */
 		rt->fib6_prefsrc.plen = 0;
-		/* need to update cache as well */
-		rt6_exceptions_remove_prefsrc(rt);
 		spin_unlock_bh(&rt6_exception_lock);
 	}
 	return 0;
-- 
cgit v1.2.3


From aea890b8b2e071bb75043353581f2197a2f13160 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Jul 2018 16:22:13 -0700
Subject: sch_htb: Remove local SKB queue handling code.

Instead, adjust __qdisc_enqueue_tail() such that HTB can use it
instead.

The only other caller of __qdisc_enqueue_tail() is
qdisc_enqueue_tail() so we can move the backlog and return value
handling (which HTB doesn't need/want) to the latter.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 11 +++++------
 net/sched/sch_htb.c       | 18 +-----------------
 2 files changed, 6 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..bc8f6b0b6610 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -828,8 +828,8 @@ static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh)
 	qh->qlen = 0;
 }
 
-static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
-				       struct qdisc_skb_head *qh)
+static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
+					struct qdisc_skb_head *qh)
 {
 	struct sk_buff *last = qh->tail;
 
@@ -842,14 +842,13 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 		qh->head = skb;
 	}
 	qh->qlen++;
-	qdisc_qstats_backlog_inc(sch, skb);
-
-	return NET_XMIT_SUCCESS;
 }
 
 static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
 {
-	return __qdisc_enqueue_tail(skb, sch, &sch->q);
+	__qdisc_enqueue_tail(skb, &sch->q);
+	qdisc_qstats_backlog_inc(sch, skb);
+	return NET_XMIT_SUCCESS;
 }
 
 static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 43c4bfe625a9..cf23829cbede 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -577,22 +577,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 	cl->prio_activity = 0;
 }
 
-static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
-			     struct qdisc_skb_head *qh)
-{
-	struct sk_buff *last = qh->tail;
-
-	if (last) {
-		skb->next = NULL;
-		last->next = skb;
-		qh->tail = skb;
-	} else {
-		qh->tail = skb;
-		qh->head = skb;
-	}
-	qh->qlen++;
-}
-
 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		       struct sk_buff **to_free)
 {
@@ -603,7 +587,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (cl == HTB_DIRECT) {
 		/* enqueue to helper queue */
 		if (q->direct_queue.qlen < q->direct_qlen) {
-			htb_enqueue_tail(skb, sch, &q->direct_queue);
+			__qdisc_enqueue_tail(skb, &q->direct_queue);
 			q->direct_pkts++;
 		} else {
 			return qdisc_drop(skb, sch, to_free);
-- 
cgit v1.2.3


From 596977300ab5c5d5d85f7950dd7f299f8322e533 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Jul 2018 16:33:28 -0700
Subject: sch_netem: Move private queue handler to generic location.

By hand copies of SKB list handlers do not belong in individual packet
schedulers.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 11 +++++++++++
 net/sched/sch_netem.c     | 12 +-----------
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index bc8f6b0b6610..fdaa5506e6f7 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -851,6 +851,17 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
+static inline void __qdisc_enqueue_head(struct sk_buff *skb,
+					struct qdisc_skb_head *qh)
+{
+	skb->next = qh->head;
+
+	if (!qh->head)
+		qh->tail = skb;
+	qh->head = skb;
+	qh->qlen++;
+}
+
 static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
 {
 	struct sk_buff *skb = qh->head;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index ad18a2052416..b9541ce4d672 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -412,16 +412,6 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 	return segs;
 }
 
-static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
-{
-	skb->next = qh->head;
-
-	if (!qh->head)
-		qh->tail = skb;
-	qh->head = skb;
-	qh->qlen++;
-}
-
 /*
  * Insert one skb into qdisc.
  * Note: parent depends on return value to account for queue length.
@@ -570,7 +560,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		cb->time_to_send = ktime_get_ns();
 		q->counter = 0;
 
-		netem_enqueue_skb_head(&sch->q, skb);
+		__qdisc_enqueue_head(skb, &sch->q);
 		sch->qstats.requeues++;
 	}
 
-- 
cgit v1.2.3


From 8b69bd7d8a8927d537f134c37bcca6cbfa58e1b2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Aug 2018 18:43:38 -0700
Subject: ppp: Remove direct skb_queue_head list pointer access.

Add a helper, __skb_peek(), and use it in ppp_mp_reconstruct().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/ppp_generic.c |  2 +-
 include/linux/skbuff.h        | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 02ad03a2fab7..500bc0027c1b 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -2400,7 +2400,7 @@ ppp_mp_reconstruct(struct ppp *ppp)
 
 	if (ppp->mrru == 0)	/* do nothing until mrru is set */
 		return NULL;
-	head = list->next;
+	head = __skb_peek(list);
 	tail = NULL;
 	skb_queue_walk_safe(list, p, tmp) {
 	again:
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..89283b77294d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1592,6 +1592,17 @@ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
 	return skb;
 }
 
+/**
+ *	__skb_peek - peek at the head of a non-empty &sk_buff_head
+ *	@list_: list to peek at
+ *
+ *	Like skb_peek(), but the caller knows that the list is not empty.
+ */
+static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
+{
+	return list_->next;
+}
+
 /**
  *	skb_peek_next - peek skb following the given one from a queue
  *	@skb: skb to start from
-- 
cgit v1.2.3


From a8305bff685252e80b7c60f4f5e7dd2e63e38218 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 Jul 2018 20:42:53 -0700
Subject: net: Add and use skb_mark_not_on_list().

An SKB is not on a list if skb->next is NULL.

Codify this convention into a helper function and use it
where we are dequeueing an SKB and need to mark it as such.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h                  | 5 +++++
 net/core/dev.c                          | 8 ++++----
 net/core/sock.c                         | 2 +-
 net/ieee802154/6lowpan/reassembly.c     | 2 +-
 net/ipv4/ip_fragment.c                  | 2 +-
 net/ipv4/ip_input.c                     | 2 +-
 net/ipv4/ip_output.c                    | 4 ++--
 net/ipv6/ip6_output.c                   | 2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
 net/ipv6/reassembly.c                   | 2 +-
 net/netfilter/nfnetlink_queue.c         | 2 +-
 net/rxrpc/input.c                       | 2 +-
 net/sched/sch_cake.c                    | 6 +++---
 net/sched/sch_fq.c                      | 2 +-
 net/sched/sch_fq_codel.c                | 2 +-
 net/sched/sch_generic.c                 | 4 ++--
 net/sched/sch_hhf.c                     | 2 +-
 net/sched/sch_netem.c                   | 2 +-
 net/sched/sch_tbf.c                     | 2 +-
 net/tipc/bearer.c                       | 2 +-
 net/xfrm/xfrm_device.c                  | 2 +-
 net/xfrm/xfrm_output.c                  | 2 +-
 22 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89283b77294d..c4c9e3f5cd9a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1339,6 +1339,11 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
 	}
 }
 
+static inline void skb_mark_not_on_list(struct sk_buff *skb)
+{
+	skb->next = NULL;
+}
+
 /**
  *	skb_queue_empty - check if a queue is empty
  *	@list: queue head
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..f76dd7e14dd6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3231,7 +3231,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 	while (skb) {
 		struct sk_buff *next = skb->next;
 
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		rc = xmit_one(skb, dev, txq, next != NULL);
 		if (unlikely(!dev_xmit_complete(rc))) {
 			skb->next = next;
@@ -3331,7 +3331,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
 
 	for (; skb != NULL; skb = next) {
 		next = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 
 		/* in case skb wont be segmented, point to itself */
 		skb->prev = skb;
@@ -5296,7 +5296,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
 		list_del(&skb->list);
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		napi_gro_complete(skb);
 		napi->gro_hash[index].count--;
 	}
@@ -5482,7 +5482,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 
 	if (pp) {
 		list_del(&pp->list);
-		pp->next = NULL;
+		skb_mark_not_on_list(pp);
 		napi_gro_complete(pp);
 		napi->gro_hash[hash].count--;
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..8537b6ca72c5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2332,7 +2332,7 @@ static void __release_sock(struct sock *sk)
 			next = skb->next;
 			prefetch(next);
 			WARN_ON_ONCE(skb_dst_is_noref(skb));
-			skb->next = NULL;
+			skb_mark_not_on_list(skb);
 			sk_backlog_rcv(sk, skb);
 
 			cond_resched();
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index e7857a8ac86d..09ffbf5ce8fa 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -260,7 +260,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 	}
 	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->dev = ldev;
 	head->tstamp = fq->q.stamp;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 330f62353b11..cab3e4a5124b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -623,7 +623,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
 	*nextp = NULL;
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->prev = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..eba7f3883230 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -535,7 +535,7 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 		/* Handle ip{6}_forward case, as sch_direct_xmit have
 		 * another kind of SKB-list usage (see validate_xmit_skb_list)
 		 */
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		dst_input(skb);
 	}
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..c09219e7f230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 		struct sk_buff *nskb = segs->next;
 		int err;
 
-		segs->next = NULL;
+		skb_mark_not_on_list(segs);
 		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
 
 		if (err && ret == 0)
@@ -684,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 			skb = frag;
 			frag = skb->next;
-			skb->next = NULL;
+			skb_mark_not_on_list(skb);
 		}
 
 		if (err == 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 16f200f06500..9a8934ac053b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -727,7 +727,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 			skb = frag;
 			frag = skb->next;
-			skb->next = NULL;
+			skb_mark_not_on_list(skb);
 		}
 
 		kfree(tmp_hdr);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 2a14d8b65924..00e20004d241 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -449,7 +449,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	sub_frag_mem_limit(fq->q.net, head->truesize);
 
 	head->ignore_df = 1;
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->dev = dev;
 	head->tstamp = fq->q.stamp;
 	ipv6_hdr(head)->payload_len = htons(payload_len);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c5b4f79296e..f1b1ff30fe5b 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -388,7 +388,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	}
 	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->dev = dev;
 	head->tstamp = fq->q.stamp;
 	ipv6_hdr(head)->payload_len = htons(payload_len);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea4ba551abb2..5207eb8a5864 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -764,7 +764,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
 		return ret;
 	}
 
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 
 	entry_seg = nf_queue_entry_dup(entry);
 	if (entry_seg) {
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index cfdc199c6351..ee8e7e1d5c0f 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -259,7 +259,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 	while (list) {
 		skb = list;
 		list = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
 	}
 }
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index c07c30b916d5..dc539295ae65 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -812,7 +812,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
 
 	if (skb) {
 		flow->head = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 	}
 
 	return skb;
@@ -1252,7 +1252,7 @@ found:
 	else
 		flow->head = elig_ack->next;
 
-	elig_ack->next = NULL;
+	skb_mark_not_on_list(elig_ack);
 
 	return elig_ack;
 }
@@ -1675,7 +1675,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 		while (segs) {
 			nskb = segs->next;
-			segs->next = NULL;
+			skb_mark_not_on_list(segs);
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
 			cobalt_set_enqueue_time(segs, now);
 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4808713c73b9..b27ba36a269c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -319,7 +319,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 
 	if (skb) {
 		flow->head = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		flow->qlen--;
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c0a9d5dbf94..cd04d40c30b6 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -124,7 +124,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
 	struct sk_buff *skb = flow->head;
 
 	flow->head = skb->next;
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 	return skb;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69078c82963e..a64132a5db36 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -184,7 +184,7 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
 		skb = nskb;
 		(*packets)++; /* GSO counts as one pkt */
 	}
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 }
 
 /* This variant of try_bulk_dequeue_skb() makes sure
@@ -210,7 +210,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 		skb = nskb;
 	} while (++cnt < 8);
 	(*packets) += cnt;
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 }
 
 /* Note that dequeue_skb can possibly return a SKB list (via skb->next).
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c3a8388dcdf6..9d6a47697406 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -330,7 +330,7 @@ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
 	struct sk_buff *skb = bucket->head;
 
 	bucket->head = skb->next;
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 	return skb;
 }
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b9541ce4d672..506e1960ed7f 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -568,7 +568,7 @@ finish_segs:
 	if (segs) {
 		while (segs) {
 			skb2 = segs->next;
-			segs->next = NULL;
+			skb_mark_not_on_list(segs);
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
 			last_len = segs->len;
 			rc = qdisc_enqueue(segs, sch, to_free);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6f74a426f159..a4530e85bd02 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -162,7 +162,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 	nb = 0;
 	while (segs) {
 		nskb = segs->next;
-		segs->next = NULL;
+		skb_mark_not_on_list(segs);
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
 		len += segs->len;
 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 418f03d0be90..91891041e5e1 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -577,7 +577,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
 		rcu_dereference_rtnl(orig_dev->tipc_ptr);
 	if (likely(b && test_bit(0, &b->up) &&
 		   (skb->pkt_type <= PACKET_MULTICAST))) {
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		tipc_rcv(dev_net(b->pt.dev), skb, b);
 		rcu_read_unlock();
 		return NET_RX_SUCCESS;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 5611b7521020..260fbba4f03e 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -99,7 +99,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 
 	do {
 		struct sk_buff *nskb = skb2->next;
-		skb2->next = NULL;
+		skb_mark_not_on_list(skb2);
 
 		xo = xfrm_offload(skb2);
 		xo->flags |= XFRM_DEV_RESUME;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 45ba07ab3e4f..2d42cb0c94b8 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -189,7 +189,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
 		struct sk_buff *nskb = segs->next;
 		int err;
 
-		segs->next = NULL;
+		skb_mark_not_on_list(segs);
 		err = xfrm_output2(net, sk, segs);
 
 		if (unlikely(err)) {
-- 
cgit v1.2.3


From 992cba7e276d438ac8b0a8c17b147b37c8c286f7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 31 Jul 2018 15:27:56 -0700
Subject: net: Add and use skb_list_del_init().

It documents what is happening, and eliminates the spurious list
pointer poisoning.

In the long term, in order to get proper list head debugging, we
might want to use the list poison value as the indicator that
an SKB is a singleton and not on a list.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 ++++++
 net/core/dev.c         | 6 ++----
 net/ipv4/ip_input.c    | 6 +-----
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c4c9e3f5cd9a..e3a53ca4a9b5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1344,6 +1344,12 @@ static inline void skb_mark_not_on_list(struct sk_buff *skb)
 	skb->next = NULL;
 }
 
+static inline void skb_list_del_init(struct sk_buff *skb)
+{
+	__list_del_entry(&skb->list);
+	skb_mark_not_on_list(skb);
+}
+
 /**
  *	skb_queue_empty - check if a queue is empty
  *	@list: queue head
diff --git a/net/core/dev.c b/net/core/dev.c
index f76dd7e14dd6..0b2d777e5b9e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5295,8 +5295,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 	list_for_each_entry_safe_reverse(skb, p, head, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
-		list_del(&skb->list);
-		skb_mark_not_on_list(skb);
+		skb_list_del_init(skb);
 		napi_gro_complete(skb);
 		napi->gro_hash[index].count--;
 	}
@@ -5481,8 +5480,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
 	if (pp) {
-		list_del(&pp->list);
-		skb_mark_not_on_list(pp);
+		skb_list_del_init(pp);
 		napi_gro_complete(pp);
 		napi->gro_hash[hash].count--;
 	}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index eba7f3883230..35a786c0aaa0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -531,11 +531,7 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 	struct sk_buff *skb, *next;
 
 	list_for_each_entry_safe(skb, next, head, list) {
-		list_del(&skb->list);
-		/* Handle ip{6}_forward case, as sch_direct_xmit have
-		 * another kind of SKB-list usage (see validate_xmit_skb_list)
-		 */
-		skb_mark_not_on_list(skb);
+		skb_list_del_init(skb);
 		dst_input(skb);
 	}
 }
-- 
cgit v1.2.3


From 86c55361e569400b6286f30283a9c143a18c20d9 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Fri, 7 Sep 2018 17:22:21 +0300
Subject: net: sched: cls_flower: dump offload count value

Change flower in_hw_count type to fixed-size u32 and dump it as
TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
blocks and re-offload functionality.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h    | 2 +-
 include/uapi/linux/pkt_cls.h | 2 ++
 net/sched/cls_flower.c       | 5 ++++-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index fdaa5506e6f7..d326fd553b58 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
 }
 
 static inline void
-tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt,
+tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt,
 			  u32 *flags, bool add)
 {
 	if (add) {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index be382fb0592d..401d0c1e612d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -483,6 +483,8 @@ enum {
 	TCA_FLOWER_KEY_ENC_OPTS,
 	TCA_FLOWER_KEY_ENC_OPTS_MASK,
 
+	TCA_FLOWER_IN_HW_COUNT,
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6fd9bdd93796..4b8dd37dd4f8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -98,7 +98,7 @@ struct cls_fl_filter {
 	struct list_head list;
 	u32 handle;
 	u32 flags;
-	unsigned int in_hw_count;
+	u32 in_hw_count;
 	struct rcu_work rwork;
 	struct net_device *hw_dev;
 };
@@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
 	if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
+		goto nla_put_failure;
+
 	if (tcf_exts_dump(skb, &f->exts))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 67edf21e5adfd336f2ff08668eb09850943666d3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 10 Sep 2018 17:21:42 -0700
Subject: scsi: libcxgbi: fib6_ino reference in rt6_info is rcu protected

The fib6_info reference in rt6_info is rcu protected. Add a helper
to extract prefsrc from and update cxgbi_check_route6 to use it.

Fixes: 0153167aebd0 ("net/ipv6: Remove rt6i_prefsrc")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/cxgbi/libcxgbi.c |  5 ++---
 include/net/ip6_fib.h         | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 6b3ea50c594e..75f876409fb9 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -784,7 +784,8 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
 	csk->mtu = mtu;
 	csk->dst = dst;
 
-	if (!rt->from || ipv6_addr_any(&rt->from->fib6_prefsrc.addr)) {
+	rt6_get_prefsrc(rt, &pref_saddr);
+	if (ipv6_addr_any(&pref_saddr)) {
 		struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
 
 		err = ipv6_dev_get_saddr(&init_net, idev ? idev->dev : NULL,
@@ -794,8 +795,6 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
 				&daddr6->sin6_addr);
 			goto rel_rt;
 		}
-	} else {
-		pref_saddr = rt->from->fib6_prefsrc.addr;
 	}
 
 	csk->csk_family = AF_INET6;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c7496663f99a..f06e968f1992 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -412,6 +412,25 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
+static inline
+void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
+{
+	const struct fib6_info *from;
+
+	rcu_read_lock();
+
+	from = rcu_dereference(rt->from);
+	if (from) {
+		*addr = from->fib6_prefsrc.addr;
+	} else {
+		struct in6_addr in6_zero = {};
+
+		*addr = in6_zero;
+	}
+
+	rcu_read_unlock();
+}
+
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
 	return f6i->fib6_nh.nh_dev;
-- 
cgit v1.2.3


From 41124fa64d4b298b82266b7ddbefc43540b77b44 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 12 Sep 2018 01:53:14 +0200
Subject: net: ethernet: Add helper to remove a supported link mode

Some MAC hardware cannot support a subset of link modes. e.g. often
1Gbps Full duplex is supported, but Half duplex is not. Add a helper
to remove such a link mode.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c    |  6 +++---
 drivers/net/ethernet/cadence/macb_main.c          |  5 ++---
 drivers/net/ethernet/freescale/fec_main.c         |  3 ++-
 drivers/net/ethernet/microchip/lan743x_main.c     |  2 +-
 drivers/net/ethernet/renesas/ravb_main.c          |  3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 12 ++++++++----
 drivers/net/phy/phy_device.c                      | 18 ++++++++++++++++++
 drivers/net/usb/lan78xx.c                         |  2 +-
 include/linux/phy.h                               |  1 +
 9 files changed, 38 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 078a04dc1182..4831f9de5945 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -895,9 +895,9 @@ int xgene_enet_phy_connect(struct net_device *ndev)
 	}
 
 	pdata->phy_speed = SPEED_UNKNOWN;
-	phy_dev->supported &= ~SUPPORTED_10baseT_Half &
-			      ~SUPPORTED_100baseT_Half &
-			      ~SUPPORTED_1000baseT_Half;
+	phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
+	phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_100baseT_Half_BIT);
+	phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 	phy_dev->supported |= SUPPORTED_Pause |
 			      SUPPORTED_Asym_Pause;
 	phy_dev->advertising = phy_dev->supported;
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index bd4095c3a031..96ae8c992810 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -549,9 +549,8 @@ static int macb_mii_probe(struct net_device *dev)
 		phy_set_max_speed(phydev, SPEED_100);
 
 	if (bp->caps & MACB_CAPS_NO_GIGABIT_HALF)
-		phydev->supported &= ~SUPPORTED_1000baseT_Half;
-
-	phydev->advertising = phydev->supported;
+		phy_remove_link_mode(phydev,
+				     ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
 	bp->link = 0;
 	bp->speed = 0;
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 5e849510c689..0c6fd77b6599 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1947,7 +1947,8 @@ static int fec_enet_mii_probe(struct net_device *ndev)
 	/* mask with MAC supported features */
 	if (fep->quirks & FEC_QUIRK_HAS_GBIT) {
 		phy_set_max_speed(phy_dev, 1000);
-		phy_dev->supported &= ~SUPPORTED_1000baseT_Half;
+		phy_remove_link_mode(phy_dev,
+				     ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 #if !defined(CONFIG_M5272)
 		phy_dev->supported |= SUPPORTED_Pause;
 #endif
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index e7dce79ff2c9..048307959c01 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -1013,7 +1013,7 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter)
 		goto return_error;
 
 	/* MAC doesn't support 1000T Half */
-	phydev->supported &= ~SUPPORTED_1000baseT_Half;
+	phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
 	/* support both flow controls */
 	phy->fc_request_control = (FLOW_CTRL_RX | FLOW_CTRL_TX);
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index aff5516b781e..fb2a1125780d 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1074,7 +1074,8 @@ static int ravb_phy_init(struct net_device *ndev)
 	}
 
 	/* 10BASE is not supported */
-	phydev->supported &= ~PHY_10BT_FEATURES;
+	phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
+	phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Full_BIT);
 
 	phy_attached_info(phydev);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3d7aec7a050b..3715a0a4af3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -993,10 +993,14 @@ static int stmmac_init_phy(struct net_device *dev)
 	 * Half-duplex mode not supported with multiqueue
 	 * half-duplex can only works with single queue
 	 */
-	if (tx_cnt > 1)
-		phydev->supported &= ~(SUPPORTED_1000baseT_Half |
-				       SUPPORTED_100baseT_Half |
-				       SUPPORTED_10baseT_Half);
+	if (tx_cnt > 1) {
+		phy_remove_link_mode(phydev,
+				     ETHTOOL_LINK_MODE_10baseT_Half_BIT);
+		phy_remove_link_mode(phydev,
+				     ETHTOOL_LINK_MODE_100baseT_Half_BIT);
+		phy_remove_link_mode(phydev,
+				     ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
+	}
 
 	/*
 	 * Broken HW is sometimes missing the pull-up resistor on the
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index db1172db1e7c..e9ca83a438b0 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1765,6 +1765,24 @@ int phy_set_max_speed(struct phy_device *phydev, u32 max_speed)
 }
 EXPORT_SYMBOL(phy_set_max_speed);
 
+/**
+ * phy_remove_link_mode - Remove a supported link mode
+ * @phydev: phy_device structure to remove link mode from
+ * @link_mode: Link mode to be removed
+ *
+ * Description: Some MACs don't support all link modes which the PHY
+ * does.  e.g. a 1G MAC often does not support 1000Half. Add a helper
+ * to remove a link mode.
+ */
+void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode)
+{
+	WARN_ON(link_mode > 31);
+
+	phydev->supported &= ~BIT(link_mode);
+	phydev->advertising = phydev->supported;
+}
+EXPORT_SYMBOL(phy_remove_link_mode);
+
 static void of_set_phy_supported(struct phy_device *phydev)
 {
 	struct device_node *node = phydev->mdio.dev.of_node;
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 3ce3c66559e4..95a98a20b2e3 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2166,7 +2166,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	}
 
 	/* MAC doesn't support 1000T Half */
-	phydev->supported &= ~SUPPORTED_1000baseT_Half;
+	phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
 	/* support both flow controls */
 	dev->fc_request_control = (FLOW_CTRL_RX | FLOW_CTRL_TX);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index cd6f637cbbfb..9c4c3eca8cf2 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1049,6 +1049,7 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd);
 int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
+void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
 		       int (*run)(struct phy_device *));
-- 
cgit v1.2.3


From af8d9bb2f2f405ad541794b46f9d7bc70f13e5cb Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 12 Sep 2018 01:53:15 +0200
Subject: net: ethernet: Add helper for MACs which support asym pause

Rather than have the MAC drivers manipulate phydev members to indicate
they support Asym Pause, add a helper function.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c             |  4 ++--
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c          |  4 +---
 drivers/net/ethernet/broadcom/sb1250-mac.c              |  5 +----
 drivers/net/ethernet/broadcom/tg3.c                     |  8 ++------
 drivers/net/ethernet/cortina/gemini.c                   |  3 +--
 drivers/net/ethernet/dnet.c                             |  4 +---
 drivers/net/ethernet/faraday/ftgmac100.c                |  3 +--
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c          |  3 +--
 drivers/net/ethernet/freescale/gianfar.c                |  4 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c |  4 +---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c             |  6 +-----
 drivers/net/ethernet/microchip/lan743x_main.c           |  5 +----
 drivers/net/ethernet/smsc/smsc911x.c                    |  3 +--
 drivers/net/ethernet/smsc/smsc9420.c                    |  3 +--
 drivers/net/ethernet/socionext/sni_ave.c                |  3 ++-
 drivers/net/phy/phy_device.c                            | 13 +++++++++++++
 include/linux/phy.h                                     |  1 +
 17 files changed, 33 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 3ceb4f95ca7c..289129011b9f 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -879,8 +879,8 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	phy_write(phy_data->phydev, 0x00, 0x9140);
 
 	phy_data->phydev->supported = PHY_GBIT_FEATURES;
-	phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
 	phy_data->phydev->advertising = phy_data->phydev->supported;
+	phy_support_asym_pause(phy_data->phydev);
 
 	netif_dbg(pdata, drv, pdata->netdev,
 		  "Finisar PHY quirk in place\n");
@@ -951,8 +951,8 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
 	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
 
 	phy_data->phydev->supported = PHY_GBIT_FEATURES;
-	phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
 	phy_data->phydev->advertising = phy_data->phydev->supported;
+	phy_support_asym_pause(phy_data->phydev);
 
 	netif_dbg(pdata, drv, pdata->netdev,
 		  "BelFuse PHY quirk in place\n");
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 4831f9de5945..e3560311711a 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -898,9 +898,7 @@ int xgene_enet_phy_connect(struct net_device *ndev)
 	phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
 	phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_100baseT_Half_BIT);
 	phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
-	phy_dev->supported |= SUPPORTED_Pause |
-			      SUPPORTED_Asym_Pause;
-	phy_dev->advertising = phy_dev->supported;
+	phy_support_asym_pause(phy_dev);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c
index 4ce4b097ec05..53acbbb36637 100644
--- a/drivers/net/ethernet/broadcom/sb1250-mac.c
+++ b/drivers/net/ethernet/broadcom/sb1250-mac.c
@@ -2358,13 +2358,10 @@ static int sbmac_mii_probe(struct net_device *dev)
 
 	/* Remove any features not supported by the controller */
 	phy_set_max_speed(phy_dev, SPEED_1000);
-	phy_dev->supported |= SUPPORTED_Pause |
-			      SUPPORTED_Asym_Pause;
+	phy_support_asym_pause(phy_dev);
 
 	phy_attached_info(phy_dev);
 
-	phy_dev->advertising = phy_dev->supported;
-
 	sc->phy_dev = phy_dev;
 
 	return 0;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index eab00239a47a..193e990fac7a 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -2123,15 +2123,13 @@ static int tg3_phy_init(struct tg3 *tp)
 	case PHY_INTERFACE_MODE_RGMII:
 		if (!(tp->phy_flags & TG3_PHYFLG_10_100_ONLY)) {
 			phy_set_max_speed(phydev, SPEED_1000);
-			phydev->supported |= (SUPPORTED_Pause |
-					      SUPPORTED_Asym_Pause);
+			phy_support_asym_pause(phydev);
 			break;
 		}
 		/* fallthru */
 	case PHY_INTERFACE_MODE_MII:
 		phy_set_max_speed(phydev, SPEED_100);
-		phydev->supported |= (SUPPORTED_Pause |
-				      SUPPORTED_Asym_Pause);
+		phy_support_asym_pause(phydev);
 		break;
 	default:
 		phy_disconnect(mdiobus_get_phy(tp->mdio_bus, tp->phy_addr));
@@ -2140,8 +2138,6 @@ static int tg3_phy_init(struct tg3 *tp)
 
 	tp->phy_flags |= TG3_PHYFLG_IS_CONNECTED;
 
-	phydev->advertising = phydev->supported;
-
 	phy_attached_info(phydev);
 
 	return 0;
diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index 2b46c0de90d0..ceec467f590d 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -373,8 +373,7 @@ static int gmac_setup_phy(struct net_device *netdev)
 	netdev->phydev = phy;
 
 	phy_set_max_speed(phy, SPEED_1000);
-	phy->supported |= SUPPORTED_Asym_Pause | SUPPORTED_Pause;
-	phy->advertising = phy->supported;
+	phy_support_asym_pause(phy);
 
 	/* set PHY interface type */
 	switch (phy->interface) {
diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c
index 08b7ad1594ce..79521e27f0d1 100644
--- a/drivers/net/ethernet/dnet.c
+++ b/drivers/net/ethernet/dnet.c
@@ -288,9 +288,7 @@ static int dnet_mii_probe(struct net_device *dev)
 	else
 		phy_set_max_speed(phydev, SPEED_100);
 
-	phydev->supported |= SUPPORTED_Asym_Pause | SUPPORTED_Pause;
-
-	phydev->advertising = phydev->supported;
+	phy_support_asym_pause(phydev);
 
 	bp->link = 0;
 	bp->speed = 0;
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index ed6c76d20b45..3f319ee66ab4 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1079,8 +1079,7 @@ static int ftgmac100_mii_probe(struct ftgmac100 *priv, phy_interface_t intf)
 	/* Indicate that we support PAUSE frames (see comment in
 	 * Documentation/networking/phy.txt)
 	 */
-	phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
-	phydev->advertising = phydev->supported;
+	phy_support_asym_pause(phydev);
 
 	/* Display what we found */
 	phy_attached_info(phydev);
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 783134f1b779..a5131a510e8b 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2491,8 +2491,7 @@ static int dpaa_phy_init(struct net_device *net_dev)
 
 	/* Remove any features not supported by the controller */
 	phy_dev->supported &= mac_dev->if_support;
-	phy_dev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
-	phy_dev->advertising = phy_dev->supported;
+	phy_support_asym_pause(phy_dev);
 
 	mac_dev->phy_dev = phy_dev;
 	net_dev->phydev = phy_dev;
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index f27f9bae1a4a..40a1a87cd338 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -1814,8 +1814,8 @@ static int init_phy(struct net_device *dev)
 	phydev->supported &= (GFAR_SUPPORTED | gigabit_support);
 	phydev->advertising = phydev->supported;
 
-	/* Add support for flow control, but don't advertise it by default */
-	phydev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
+	/* Add support for flow control */
+	phy_support_asym_pause(phydev);
 
 	/* disable EEE autoneg, EEE not supported by eTSEC */
 	memset(&edata, 0, sizeof(struct ethtool_eee));
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
index 05b15d254e32..24b1f2a0c32a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
@@ -211,9 +211,7 @@ int hclge_mac_connect_phy(struct hclge_dev *hdev)
 	}
 
 	phydev->supported &= HCLGE_PHY_SUPPORTED_FEATURES;
-	phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
-
-	phydev->advertising = phydev->supported;
+	phy_support_asym_pause(phydev);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index db231bda7c2a..cc1e9a96a43b 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -355,12 +355,8 @@ static int mtk_phy_connect(struct net_device *dev)
 	dev->phydev->speed = 0;
 	dev->phydev->duplex = 0;
 
-	if (of_phy_is_fixed_link(mac->of_node))
-		dev->phydev->supported |=
-		SUPPORTED_Pause | SUPPORTED_Asym_Pause;
-
 	phy_set_max_speed(dev->phydev, SPEED_1000);
-	dev->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	phy_support_asym_pause(dev->phydev);
 	dev->phydev->advertising = dev->phydev->supported |
 				    ADVERTISED_Autoneg;
 	phy_start_aneg(dev->phydev);
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index 048307959c01..b1a0e657febf 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -999,7 +999,6 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter)
 	struct phy_device *phydev;
 	struct net_device *netdev;
 	int ret = -EIO;
-	u32 mii_adv;
 
 	netdev = adapter->netdev;
 	phydev = phy_find_first(adapter->mdiobus);
@@ -1016,10 +1015,8 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter)
 	phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
 	/* support both flow controls */
+	phy_support_asym_pause(phydev);
 	phy->fc_request_control = (FLOW_CTRL_RX | FLOW_CTRL_TX);
-	phydev->advertising &= ~(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-	mii_adv = (u32)mii_advertise_flowctrl(phy->fc_request_control);
-	phydev->advertising |= mii_adv_to_ethtool_adv_t(mii_adv);
 	phy->fc_autoneg = phydev->autoneg;
 
 	phy_start(phydev);
diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index 3e34bf53f055..c009407618d9 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -1051,8 +1051,7 @@ static int smsc911x_mii_probe(struct net_device *dev)
 	phy_set_max_speed(phydev, SPEED_100);
 
 	/* mask with MAC supported features */
-	phydev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
-	phydev->advertising = phydev->supported;
+	phy_support_asym_pause(phydev);
 
 	pdata->last_duplex = -1;
 	pdata->last_carrier = -1;
diff --git a/drivers/net/ethernet/smsc/smsc9420.c b/drivers/net/ethernet/smsc/smsc9420.c
index 326177384544..9b6366b20110 100644
--- a/drivers/net/ethernet/smsc/smsc9420.c
+++ b/drivers/net/ethernet/smsc/smsc9420.c
@@ -1138,8 +1138,7 @@ static int smsc9420_mii_probe(struct net_device *dev)
 	phy_set_max_speed(phydev, SPEED_100);
 
 	/* mask with MAC supported features */
-	phydev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
-	phydev->advertising = phydev->supported;
+	phy_support_asym_pause(phydev);
 
 	phy_attached_info(phydev);
 
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 76ff364c40e9..a50720ec109c 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -1225,7 +1225,8 @@ static int ave_init(struct net_device *ndev)
 
 	if (!phy_interface_is_rgmii(phydev))
 		phy_set_max_speed(phydev, SPEED_100);
-	phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+
+	phy_support_asym_pause(phydev);
 
 	phy_attached_info(phydev);
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e9ca83a438b0..a0646a66f005 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1783,6 +1783,19 @@ void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode)
 }
 EXPORT_SYMBOL(phy_remove_link_mode);
 
+/**
+ * phy_support_asym_pause - Enable support of asym pause
+ * @phydev: target phy_device struct
+ *
+ * Description: Called by the MAC to indicate is supports Asym Pause.
+ */
+void phy_support_asym_pause(struct phy_device *phydev)
+{
+	phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	phydev->advertising = phydev->supported;
+}
+EXPORT_SYMBOL(phy_support_asym_pause);
+
 static void of_set_phy_supported(struct phy_device *phydev)
 {
 	struct device_node *node = phydev->mdio.dev.of_node;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9c4c3eca8cf2..e2db819807c1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1050,6 +1050,7 @@ int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
+void phy_support_asym_pause(struct phy_device *phydev);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
 		       int (*run)(struct phy_device *));
-- 
cgit v1.2.3


From c306ad36184fb7d0bd53f45441f45c1810e88a53 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 12 Sep 2018 01:53:16 +0200
Subject: net: ethernet: Add helper for MACs which support pause

Rather than have the MAC drivers manipulate phydev members, add a
helper function for MACs supporting Pause, but not Asym Pause.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c |  2 +-
 drivers/net/ethernet/freescale/fec_main.c    |  4 +---
 drivers/net/phy/phy_device.c                 | 14 ++++++++++++++
 include/linux/phy.h                          |  1 +
 4 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 2eee9459c2cf..9f25667c38e6 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -890,7 +890,7 @@ static int bcm_enet_open(struct net_device *dev)
 		}
 
 		/* mask with MAC supported features */
-		phydev->supported |= SUPPORTED_Pause;
+		phy_support_sym_pause(phydev);
 		phy_set_max_speed(phydev, SPEED_100);
 
 		if (priv->pause_auto && priv->pause_rx && priv->pause_tx)
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 0c6fd77b6599..05ce0903391a 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1950,14 +1950,12 @@ static int fec_enet_mii_probe(struct net_device *ndev)
 		phy_remove_link_mode(phy_dev,
 				     ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 #if !defined(CONFIG_M5272)
-		phy_dev->supported |= SUPPORTED_Pause;
+		phy_support_sym_pause(phy_dev);
 #endif
 	}
 	else
 		phy_set_max_speed(phy_dev, 100);
 
-	phy_dev->advertising = phy_dev->supported;
-
 	fep->link = 0;
 	fep->full_duplex = 0;
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index a0646a66f005..e657d5ae2ab8 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1783,6 +1783,20 @@ void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode)
 }
 EXPORT_SYMBOL(phy_remove_link_mode);
 
+/**
+ * phy_support_sym_pause - Enable support of symmetrical pause
+ * @phydev: target phy_device struct
+ *
+ * Description: Called by the MAC to indicate is supports symmetrical
+ * Pause, but not asym pause.
+ */
+void phy_support_sym_pause(struct phy_device *phydev)
+{
+	phydev->supported |= SUPPORTED_Pause;
+	phydev->advertising = phydev->supported;
+}
+EXPORT_SYMBOL(phy_support_sym_pause);
+
 /**
  * phy_support_asym_pause - Enable support of asym pause
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e2db819807c1..bc5d6c3f1388 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1050,6 +1050,7 @@ int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
+void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
-- 
cgit v1.2.3


From 70814e819c1139e5e7faacb3700eab5eac559272 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 12 Sep 2018 01:53:17 +0200
Subject: net: ethernet: Add helper for set_pauseparam for Asym Pause

ethtool can be used to enable/disable pause. Add a helper to configure
the PHY when asym pause is supported.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/apm/xgene/xgene_enet_ethtool.c    | 26 ++----------
 drivers/net/ethernet/aurora/nb8800.c               |  9 +---
 drivers/net/ethernet/broadcom/tg3.c                | 43 ++++++-------------
 drivers/net/ethernet/faraday/ftgmac100.c           | 17 ++------
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 23 +---------
 drivers/net/ethernet/freescale/gianfar_ethtool.c   | 49 +++++++---------------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    |  8 +---
 drivers/net/ethernet/socionext/sni_ave.c           | 11 +----
 drivers/net/phy/phy_device.c                       | 30 +++++++++++++
 include/linux/phy.h                                |  1 +
 10 files changed, 69 insertions(+), 148 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
index 4f50f11718f4..dfe03afd00b0 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -306,7 +306,6 @@ static int xgene_set_pauseparam(struct net_device *ndev,
 {
 	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
 	struct phy_device *phydev = ndev->phydev;
-	u32 oldadv, newadv;
 
 	if (phy_interface_mode_is_rgmii(pdata->phy_mode) ||
 	    pdata->phy_mode == PHY_INTERFACE_MODE_SGMII) {
@@ -322,29 +321,12 @@ static int xgene_set_pauseparam(struct net_device *ndev,
 		pdata->tx_pause = pp->tx_pause;
 		pdata->rx_pause = pp->rx_pause;
 
-		oldadv = phydev->advertising;
-		newadv = oldadv & ~(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
+		phy_set_asym_pause(phydev, pp->rx_pause,  pp->tx_pause);
 
-		if (pp->rx_pause)
-			newadv |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-
-		if (pp->tx_pause)
-			newadv ^= ADVERTISED_Asym_Pause;
-
-		if (oldadv ^ newadv) {
-			phydev->advertising = newadv;
-
-			if (phydev->autoneg)
-				return phy_start_aneg(phydev);
-
-			if (!pp->autoneg) {
-				pdata->mac_ops->flowctl_tx(pdata,
-							   pdata->tx_pause);
-				pdata->mac_ops->flowctl_rx(pdata,
-							   pdata->rx_pause);
-			}
+		if (!pp->autoneg) {
+			pdata->mac_ops->flowctl_tx(pdata, pdata->tx_pause);
+			pdata->mac_ops->flowctl_rx(pdata, pdata->rx_pause);
 		}
-
 	} else {
 		if (pp->autoneg)
 			return -EINVAL;
diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c
index c8d1f8fa4713..6f56276015a4 100644
--- a/drivers/net/ethernet/aurora/nb8800.c
+++ b/drivers/net/ethernet/aurora/nb8800.c
@@ -935,18 +935,11 @@ static void nb8800_pause_adv(struct net_device *dev)
 {
 	struct nb8800_priv *priv = netdev_priv(dev);
 	struct phy_device *phydev = dev->phydev;
-	u32 adv = 0;
 
 	if (!phydev)
 		return;
 
-	if (priv->pause_rx)
-		adv |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-	if (priv->pause_tx)
-		adv ^= ADVERTISED_Asym_Pause;
-
-	phydev->supported |= adv;
-	phydev->advertising |= adv;
+	phy_set_asym_pause(phydev, priv->pause_rx, priv->pause_tx);
 }
 
 static int nb8800_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 193e990fac7a..b2a3d008e1df 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12492,7 +12492,6 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 		tg3_warn_mgmt_link_flap(tp);
 
 	if (tg3_flag(tp, USE_PHYLIB)) {
-		u32 newadv;
 		struct phy_device *phydev;
 
 		phydev = mdiobus_get_phy(tp->mdio_bus, tp->phy_addr);
@@ -12503,20 +12502,16 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 			return -EINVAL;
 
 		tp->link_config.flowctrl = 0;
+		phy_set_asym_pause(phydev, epause->rx_pause, epause->tx_pause);
 		if (epause->rx_pause) {
 			tp->link_config.flowctrl |= FLOW_CTRL_RX;
 
 			if (epause->tx_pause) {
 				tp->link_config.flowctrl |= FLOW_CTRL_TX;
-				newadv = ADVERTISED_Pause;
-			} else
-				newadv = ADVERTISED_Pause |
-					 ADVERTISED_Asym_Pause;
+			}
 		} else if (epause->tx_pause) {
 			tp->link_config.flowctrl |= FLOW_CTRL_TX;
-			newadv = ADVERTISED_Asym_Pause;
-		} else
-			newadv = 0;
+		}
 
 		if (epause->autoneg)
 			tg3_flag_set(tp, PAUSE_AUTONEG);
@@ -12524,33 +12519,19 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 			tg3_flag_clear(tp, PAUSE_AUTONEG);
 
 		if (tp->phy_flags & TG3_PHYFLG_IS_CONNECTED) {
-			u32 oldadv = phydev->advertising &
-				     (ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-			if (oldadv != newadv) {
-				phydev->advertising &=
-					~(ADVERTISED_Pause |
-					  ADVERTISED_Asym_Pause);
-				phydev->advertising |= newadv;
-				if (phydev->autoneg) {
-					/*
-					 * Always renegotiate the link to
-					 * inform our link partner of our
-					 * flow control settings, even if the
-					 * flow control is forced.  Let
-					 * tg3_adjust_link() do the final
-					 * flow control setup.
-					 */
-					return phy_start_aneg(phydev);
-				}
+			if (phydev->autoneg) {
+				/* phy_set_asym_pause() will
+				 * renegotiate the link to inform our
+				 * link partner of our flow control
+				 * settings, even if the flow control
+				 * is forced.  Let tg3_adjust_link()
+				 * do the final flow control setup.
+				 */
+				return 0;
 			}
 
 			if (!epause->autoneg)
 				tg3_setup_flow_control(tp, 0, 0);
-		} else {
-			tp->link_config.advertising &=
-					~(ADVERTISED_Pause |
-					  ADVERTISED_Asym_Pause);
-			tp->link_config.advertising |= newadv;
 		}
 	} else {
 		int irq_sync = 0;
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 3f319ee66ab4..d8ead7e4177e 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1219,22 +1219,11 @@ static int ftgmac100_set_pauseparam(struct net_device *netdev,
 	priv->tx_pause = pause->tx_pause;
 	priv->rx_pause = pause->rx_pause;
 
-	if (phydev) {
-		phydev->advertising &= ~ADVERTISED_Pause;
-		phydev->advertising &= ~ADVERTISED_Asym_Pause;
+	if (phydev)
+		phy_set_asym_pause(phydev, pause->rx_pause, pause->tx_pause);
 
-		if (pause->rx_pause) {
-			phydev->advertising |= ADVERTISED_Pause;
-			phydev->advertising |= ADVERTISED_Asym_Pause;
-		}
-
-		if (pause->tx_pause)
-			phydev->advertising ^= ADVERTISED_Asym_Pause;
-	}
 	if (netif_running(netdev)) {
-		if (phydev && priv->aneg_pause)
-			phy_start_aneg(phydev);
-		else
+		if (!(phydev && priv->aneg_pause))
 			ftgmac100_config_pause(priv);
 	}
 
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 3184c8f7cdd0..1f8cdbc4378c 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -210,29 +210,8 @@ static int dpaa_set_pauseparam(struct net_device *net_dev,
 	/* Determine the sym/asym advertised PAUSE capabilities from the desired
 	 * rx/tx pause settings.
 	 */
-	newadv = 0;
-	if (epause->rx_pause)
-		newadv = ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-	if (epause->tx_pause)
-		newadv ^= ADVERTISED_Asym_Pause;
 
-	oldadv = phydev->advertising &
-			(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-
-	/* If there are differences between the old and the new advertised
-	 * values, restart PHY autonegotiation and advertise the new values.
-	 */
-	if (oldadv != newadv) {
-		phydev->advertising &= ~(ADVERTISED_Pause
-				| ADVERTISED_Asym_Pause);
-		phydev->advertising |= newadv;
-		if (phydev->autoneg) {
-			err = phy_start_aneg(phydev);
-			if (err < 0)
-				netdev_err(net_dev, "phy_start_aneg() = %d\n",
-					   err);
-		}
-	}
+	phy_set_asym_pause(phydev, epause->rx_pause, epause->tx_pause);
 
 	fman_get_pause_cfg(mac_dev, &rx_pause, &tx_pause);
 	err = fman_set_mac_active_pause(mac_dev, rx_pause, tx_pause);
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index 395a5266ea30..3545e8f715f2 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -503,7 +503,6 @@ static int gfar_spauseparam(struct net_device *dev,
 	struct gfar_private *priv = netdev_priv(dev);
 	struct phy_device *phydev = dev->phydev;
 	struct gfar __iomem *regs = priv->gfargrp[0].regs;
-	u32 oldadv, newadv;
 
 	if (!phydev)
 		return -ENODEV;
@@ -514,54 +513,36 @@ static int gfar_spauseparam(struct net_device *dev,
 		return -EINVAL;
 
 	priv->rx_pause_en = priv->tx_pause_en = 0;
+	phy_set_asym_pause(phydev, epause->rx_pause, epause->tx_pause);
 	if (epause->rx_pause) {
 		priv->rx_pause_en = 1;
 
 		if (epause->tx_pause) {
 			priv->tx_pause_en = 1;
-			/* FLOW_CTRL_RX & TX */
-			newadv = ADVERTISED_Pause;
-		} else  /* FLOW_CTLR_RX */
-			newadv = ADVERTISED_Pause | ADVERTISED_Asym_Pause;
+		}
 	} else if (epause->tx_pause) {
 		priv->tx_pause_en = 1;
-		/* FLOW_CTLR_TX */
-		newadv = ADVERTISED_Asym_Pause;
-	} else
-		newadv = 0;
+	}
 
 	if (epause->autoneg)
 		priv->pause_aneg_en = 1;
 	else
 		priv->pause_aneg_en = 0;
 
-	oldadv = phydev->advertising &
-		(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-	if (oldadv != newadv) {
-		phydev->advertising &=
-			~(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-		phydev->advertising |= newadv;
-		if (phydev->autoneg)
-			/* inform link partner of our
-			 * new flow ctrl settings
-			 */
-			return phy_start_aneg(phydev);
-
-		if (!epause->autoneg) {
-			u32 tempval;
-			tempval = gfar_read(&regs->maccfg1);
-			tempval &= ~(MACCFG1_TX_FLOW | MACCFG1_RX_FLOW);
-
-			priv->tx_actual_en = 0;
-			if (priv->tx_pause_en) {
-				priv->tx_actual_en = 1;
-				tempval |= MACCFG1_TX_FLOW;
-			}
+	if (!epause->autoneg) {
+		u32 tempval = gfar_read(&regs->maccfg1);
 
-			if (priv->rx_pause_en)
-				tempval |= MACCFG1_RX_FLOW;
-			gfar_write(&regs->maccfg1, tempval);
+		tempval &= ~(MACCFG1_TX_FLOW | MACCFG1_RX_FLOW);
+
+		priv->tx_actual_en = 0;
+		if (priv->tx_pause_en) {
+			priv->tx_actual_en = 1;
+			tempval |= MACCFG1_TX_FLOW;
 		}
+
+		if (priv->rx_pause_en)
+			tempval |= MACCFG1_RX_FLOW;
+		gfar_write(&regs->maccfg1, tempval);
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c56db06b63e0..cf18608669f5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5228,13 +5228,7 @@ static void hclge_set_flowctrl_adv(struct hclge_dev *hdev, u32 rx_en, u32 tx_en)
 	if (!phydev)
 		return;
 
-	phydev->advertising &= ~(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-
-	if (rx_en)
-		phydev->advertising |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-
-	if (tx_en)
-		phydev->advertising ^= ADVERTISED_Asym_Pause;
+	phy_set_asym_pause(phydev, rx_en, tx_en);
 }
 
 static int hclge_cfg_pauseparam(struct hclge_dev *hdev, u32 rx_en, u32 tx_en)
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index a50720ec109c..61e6abb966ac 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -461,16 +461,7 @@ static int ave_ethtool_set_pauseparam(struct net_device *ndev,
 	priv->pause_rx   = pause->rx_pause;
 	priv->pause_tx   = pause->tx_pause;
 
-	phydev->advertising &= ~(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
-	if (pause->rx_pause)
-		phydev->advertising |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-	if (pause->tx_pause)
-		phydev->advertising ^= ADVERTISED_Asym_Pause;
-
-	if (pause->autoneg) {
-		if (netif_running(ndev))
-			phy_start_aneg(phydev);
-	}
+	phy_set_asym_pause(phydev, pause->rx_pause, pause->tx_pause);
 
 	return 0;
 }
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e657d5ae2ab8..5732d89c8e37 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1810,6 +1810,36 @@ void phy_support_asym_pause(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_support_asym_pause);
 
+/**
+ * phy_set_asym_pause - Configure Pause and Asym Pause
+ * @phydev: target phy_device struct
+ * @rx: Receiver Pause is supported
+ * @tx: Transmit Pause is supported
+ *
+ * Description: Configure advertised Pause support depending on if
+ * transmit and receiver pause is supported. If there has been a
+ * change in adverting, trigger a new autoneg. Generally called from
+ * the set_pauseparam .ndo.
+ */
+void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx)
+{
+	u16 oldadv = phydev->advertising;
+	u16 newadv = oldadv &= ~(SUPPORTED_Pause | SUPPORTED_Asym_Pause);
+
+	if (rx)
+		newadv |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	if (tx)
+		newadv ^= SUPPORTED_Asym_Pause;
+
+	if (oldadv != newadv) {
+		phydev->advertising = newadv;
+
+		if (phydev->autoneg)
+			phy_start_aneg(phydev);
+	}
+}
+EXPORT_SYMBOL(phy_set_asym_pause);
+
 static void of_set_phy_supported(struct phy_device *phydev)
 {
 	struct device_node *node = phydev->mdio.dev.of_node;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bc5d6c3f1388..e4062ba7472f 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1052,6 +1052,7 @@ int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
 void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
+void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
 		       int (*run)(struct phy_device *));
-- 
cgit v1.2.3


From 0c122405d4c3ec638ba00865c872ec5a3ed1a6c0 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 12 Sep 2018 01:53:18 +0200
Subject: net: ethernet: Add helper for set_pauseparam for Pause

ethtool can be used to enable/disable pause. Add a helper to configure
the PHY when Pause is supported.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c |  7 ++-----
 drivers/net/ethernet/freescale/fec_main.c    |  9 ++-------
 drivers/net/phy/phy_device.c                 | 23 +++++++++++++++++++++++
 include/linux/phy.h                          |  2 ++
 4 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 9f25667c38e6..02e7dfc1a2ef 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -892,11 +892,8 @@ static int bcm_enet_open(struct net_device *dev)
 		/* mask with MAC supported features */
 		phy_support_sym_pause(phydev);
 		phy_set_max_speed(phydev, SPEED_100);
-
-		if (priv->pause_auto && priv->pause_rx && priv->pause_tx)
-			phydev->advertising |= SUPPORTED_Pause;
-		else
-			phydev->advertising &= ~SUPPORTED_Pause;
+		phy_set_sym_pause(phydev, priv->pause_rx, priv->pause_rx,
+				  priv->pause_auto);
 
 		phy_attached_info(phydev);
 
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 05ce0903391a..2e0bb90131b6 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2229,13 +2229,8 @@ static int fec_enet_set_pauseparam(struct net_device *ndev,
 	fep->pause_flag |= pause->rx_pause ? FEC_PAUSE_FLAG_ENABLE : 0;
 	fep->pause_flag |= pause->autoneg ? FEC_PAUSE_FLAG_AUTONEG : 0;
 
-	if (pause->rx_pause || pause->autoneg) {
-		ndev->phydev->supported |= ADVERTISED_Pause;
-		ndev->phydev->advertising |= ADVERTISED_Pause;
-	} else {
-		ndev->phydev->supported &= ~ADVERTISED_Pause;
-		ndev->phydev->advertising &= ~ADVERTISED_Pause;
-	}
+	phy_set_sym_pause(ndev->phydev, pause->rx_pause, pause->tx_pause,
+			  pause->autoneg);
 
 	if (pause->autoneg) {
 		if (netif_running(ndev))
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 5732d89c8e37..de95f1e072e9 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1810,6 +1810,29 @@ void phy_support_asym_pause(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_support_asym_pause);
 
+/**
+ * phy_set_sym_pause - Configure symmetric Pause
+ * @phydev: target phy_device struct
+ * @rx: Receiver Pause is supported
+ * @tx: Transmit Pause is supported
+ * @autoneg: Auto neg should be used
+ *
+ * Description: Configure advertised Pause support depending on if
+ * receiver pause and pause auto neg is supported. Generally called
+ * from the set_pauseparam .ndo.
+ */
+void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
+		       bool autoneg)
+{
+	phydev->supported &= ~SUPPORTED_Pause;
+
+	if (rx && tx && autoneg)
+		phydev->supported |= SUPPORTED_Pause;
+
+	phydev->advertising = phydev->supported;
+}
+EXPORT_SYMBOL(phy_set_sym_pause);
+
 /**
  * phy_set_asym_pause - Configure Pause and Asym Pause
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e4062ba7472f..8521391ebb20 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1052,6 +1052,8 @@ int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
 void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
+void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
+		       bool autoneg);
 void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
-- 
cgit v1.2.3


From 22b7d29926b577ff4f480611380d03268545b787 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 12 Sep 2018 01:53:19 +0200
Subject: net: ethernet: Add helper to determine if pause configuration is
 supported

Rather than have MAC drivers open code the test, add a helper in
phylib. This will help when we change the type of phydev->supported.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c |  4 +---
 drivers/net/ethernet/broadcom/tg3.c                 |  4 +---
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c  |  4 +---
 drivers/net/ethernet/freescale/gianfar_ethtool.c    |  4 +---
 drivers/net/phy/phy_device.c                        | 20 ++++++++++++++++++++
 include/linux/phy.h                                 |  2 ++
 6 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
index dfe03afd00b0..78dd09b5beeb 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -312,9 +312,7 @@ static int xgene_set_pauseparam(struct net_device *ndev,
 		if (!phydev)
 			return -EINVAL;
 
-		if (!(phydev->supported & SUPPORTED_Pause) ||
-		    (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-		     pp->rx_pause != pp->tx_pause))
+		if (!phy_validate_pause(phydev, pp))
 			return -EINVAL;
 
 		pdata->pause_autoneg = pp->autoneg;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index b2a3d008e1df..fb0e458e25b7 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12496,9 +12496,7 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 
 		phydev = mdiobus_get_phy(tp->mdio_bus, tp->phy_addr);
 
-		if (!(phydev->supported & SUPPORTED_Pause) ||
-		    (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-		     (epause->rx_pause != epause->tx_pause)))
+		if (!phy_validate_pause(phydev, epause))
 			return -EINVAL;
 
 		tp->link_config.flowctrl = 0;
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 1f8cdbc4378c..5d0fdf667b82 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -194,9 +194,7 @@ static int dpaa_set_pauseparam(struct net_device *net_dev,
 		return -ENODEV;
 	}
 
-	if (!(phydev->supported & SUPPORTED_Pause) ||
-	    (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-	    (epause->rx_pause != epause->tx_pause)))
+	if (!phy_validate_pause(phydev, epause))
 		return -EINVAL;
 
 	/* The MAC should know how to handle PAUSE frame autonegotiation before
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index 3545e8f715f2..d3662965f59d 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -507,9 +507,7 @@ static int gfar_spauseparam(struct net_device *dev,
 	if (!phydev)
 		return -ENODEV;
 
-	if (!(phydev->supported & SUPPORTED_Pause) ||
-	    (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-	     (epause->rx_pause != epause->tx_pause)))
+	if (!phy_validate_pause(phydev, epause))
 		return -EINVAL;
 
 	priv->rx_pause_en = priv->tx_pause_en = 0;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index de95f1e072e9..af64a9320fb0 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1863,6 +1863,26 @@ void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx)
 }
 EXPORT_SYMBOL(phy_set_asym_pause);
 
+/**
+ * phy_validate_pause - Test if the PHY/MAC support the pause configuration
+ * @phydev: phy_device struct
+ * @pp: requested pause configuration
+ *
+ * Description: Test if the PHY/MAC combination supports the Pause
+ * configuration the user is requesting. Returns True if it is
+ * supported, false otherwise.
+ */
+bool phy_validate_pause(struct phy_device *phydev,
+			struct ethtool_pauseparam *pp)
+{
+	if (!(phydev->supported & SUPPORTED_Pause) ||
+	    (!(phydev->supported & SUPPORTED_Asym_Pause) &&
+	     pp->rx_pause != pp->tx_pause))
+		return false;
+	return true;
+}
+EXPORT_SYMBOL(phy_validate_pause);
+
 static void of_set_phy_supported(struct phy_device *phydev)
 {
 	struct device_node *node = phydev->mdio.dev.of_node;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 8521391ebb20..192a1fa0c73b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1055,6 +1055,8 @@ void phy_support_asym_pause(struct phy_device *phydev);
 void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
 		       bool autoneg);
 void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx);
+bool phy_validate_pause(struct phy_device *phydev,
+			struct ethtool_pauseparam *pp);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
 		       int (*run)(struct phy_device *));
-- 
cgit v1.2.3


From 435f2e7cc0b783615d7fbcf08f5f00d289f9caeb Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 11 Sep 2018 09:39:53 +0300
Subject: net: bridge: add support for sticky fdb entries

Add support for entries which are "sticky", i.e. will not change their port
if they show up from a different one. A new ndm flag is introduced for that
purpose - NTF_STICKY. We allow to set it only to non-local entries.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/neighbour.h |  1 +
 net/bridge/br_fdb.c            | 19 ++++++++++++++++---
 net/bridge/br_private.h        |  1 +
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 904db6148476..998155444e0d 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -43,6 +43,7 @@ enum {
 #define NTF_PROXY	0x08	/* == ATF_PUBL */
 #define NTF_EXT_LEARNED	0x10
 #define NTF_OFFLOADED   0x20
+#define NTF_STICKY	0x40
 #define NTF_ROUTER	0x80
 
 /*
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 502f66349530..a56ed7f2a3a3 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -584,7 +584,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 			unsigned long now = jiffies;
 
 			/* fastpath: update of existing entry */
-			if (unlikely(source != fdb->dst)) {
+			if (unlikely(source != fdb->dst && !fdb->is_sticky)) {
 				fdb->dst = source;
 				fdb_modified = true;
 				/* Take over HW learned entry */
@@ -656,6 +656,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 		ndm->ndm_flags |= NTF_OFFLOADED;
 	if (fdb->added_by_external_learn)
 		ndm->ndm_flags |= NTF_EXT_LEARNED;
+	if (fdb->is_sticky)
+		ndm->ndm_flags |= NTF_STICKY;
 
 	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
 		goto nla_put_failure;
@@ -772,8 +774,10 @@ skip:
 
 /* Update (create or replace) forwarding database entry */
 static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
-			 const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
+			 const u8 *addr, u16 state, u16 flags, u16 vid,
+			 u8 ndm_flags)
 {
+	u8 is_sticky = !!(ndm_flags & NTF_STICKY);
 	struct net_bridge_fdb_entry *fdb;
 	bool modified = false;
 
@@ -789,6 +793,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 		return -EINVAL;
 	}
 
+	if (is_sticky && (state & NUD_PERMANENT))
+		return -EINVAL;
+
 	fdb = br_fdb_find(br, addr, vid);
 	if (fdb == NULL) {
 		if (!(flags & NLM_F_CREATE))
@@ -832,6 +839,12 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 
 		modified = true;
 	}
+
+	if (is_sticky != fdb->is_sticky) {
+		fdb->is_sticky = is_sticky;
+		modified = true;
+	}
+
 	fdb->added_by_user = 1;
 
 	fdb->used = jiffies;
@@ -865,7 +878,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 	} else {
 		spin_lock_bh(&br->hash_lock);
 		err = fdb_add_entry(br, p, addr, ndm->ndm_state,
-				    nlh_flags, vid);
+				    nlh_flags, vid, ndm->ndm_flags);
 		spin_unlock_bh(&br->hash_lock);
 	}
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 11ed2029985f..d21035a17f4c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -181,6 +181,7 @@ struct net_bridge_fdb_entry {
 	struct hlist_node		fdb_node;
 	unsigned char			is_local:1,
 					is_static:1,
+					is_sticky:1,
 					added_by_user:1,
 					added_by_external_learn:1,
 					offloaded:1;
-- 
cgit v1.2.3


From 52d0d404d39dd9eac71a181615d6ca15e23d8e38 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 12 Sep 2018 10:04:21 +0800
Subject: geneve: add ttl inherit support

Similar with commit 72f6d71e491e6 ("vxlan: add ttl inherit support"),
currently ttl == 0 means "use whatever default value" on geneve instead
of inherit inner ttl. To respect compatibility with old behavior, let's
add a new IFLA_GENEVE_TTL_INHERIT for geneve ttl inherit support.

Reported-by: Jianlin Shi <jishi@redhat.com>
Suggested-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c               | 41 ++++++++++++++++++++++++++++++--------
 include/uapi/linux/if_link.h       |  1 +
 tools/include/uapi/linux/if_link.h |  1 +
 3 files changed, 35 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6acb6b5718b9..6625fabe2c88 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -69,6 +69,7 @@ struct geneve_dev {
 	struct gro_cells   gro_cells;
 	bool		   collect_md;
 	bool		   use_udp6_rx_checksums;
+	bool		   ttl_inherit;
 };
 
 struct geneve_sock {
@@ -843,7 +844,11 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		ttl = key->ttl;
 	} else {
 		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
-		ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
+		if (geneve->ttl_inherit)
+			ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
+		else
+			ttl = key->ttl;
+		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 	}
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 
@@ -889,7 +894,11 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	} else {
 		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
 					   ip_hdr(skb), skb);
-		ttl = key->ttl ? : ip6_dst_hoplimit(dst);
+		if (geneve->ttl_inherit)
+			ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
+		else
+			ttl = key->ttl;
+		ttl = ttl ? : ip6_dst_hoplimit(dst);
 	}
 	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
 	if (unlikely(err))
@@ -1091,6 +1100,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
+	[IFLA_GENEVE_TTL_INHERIT]	= { .type = NLA_U8 },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1170,7 +1180,8 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
 static int geneve_configure(struct net *net, struct net_device *dev,
 			    struct netlink_ext_ack *extack,
 			    const struct ip_tunnel_info *info,
-			    bool metadata, bool ipv6_rx_csum)
+			    bool metadata, bool ipv6_rx_csum,
+			    bool ttl_inherit)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1219,6 +1230,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	geneve->info = *info;
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
+	geneve->ttl_inherit = ttl_inherit;
 
 	err = register_netdevice(dev);
 	if (err)
@@ -1237,7 +1249,8 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
 static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack,
 			  struct ip_tunnel_info *info, bool *metadata,
-			  bool *use_udp6_rx_checksums, bool changelink)
+			  bool *use_udp6_rx_checksums, bool *ttl_inherit,
+			  bool changelink)
 {
 	int attrtype;
 
@@ -1315,6 +1328,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_GENEVE_TTL])
 		info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
 
+	if (data[IFLA_GENEVE_TTL_INHERIT])
+		*ttl_inherit = true;
+
 	if (data[IFLA_GENEVE_TOS])
 		info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
@@ -1438,17 +1454,18 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 {
 	bool use_udp6_rx_checksums = false;
 	struct ip_tunnel_info info;
+	bool ttl_inherit = false;
 	bool metadata = false;
 	int err;
 
 	init_tnl_info(&info, GENEVE_UDP_PORT);
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, false);
+			     &use_udp6_rx_checksums, &ttl_inherit, false);
 	if (err)
 		return err;
 
 	err = geneve_configure(net, dev, extack, &info, metadata,
-			       use_udp6_rx_checksums);
+			       use_udp6_rx_checksums, ttl_inherit);
 	if (err)
 		return err;
 
@@ -1511,6 +1528,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_info info;
 	bool metadata;
 	bool use_udp6_rx_checksums;
+	bool ttl_inherit;
 	int err;
 
 	/* If the geneve device is configured for metadata (or externally
@@ -1523,8 +1541,9 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	memcpy(&info, &geneve->info, sizeof(info));
 	metadata = geneve->collect_md;
 	use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+	ttl_inherit = geneve->ttl_inherit;
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, true);
+			     &use_udp6_rx_checksums, &ttl_inherit, true);
 	if (err)
 		return err;
 
@@ -1537,6 +1556,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	geneve->info = info;
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = use_udp6_rx_checksums;
+	geneve->ttl_inherit = ttl_inherit;
 	geneve_unquiesce(geneve, gs4, gs6);
 
 	return 0;
@@ -1562,6 +1582,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
+		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
 		0;
 }
 
@@ -1569,6 +1590,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct ip_tunnel_info *info = &geneve->info;
+	bool ttl_inherit = geneve->ttl_inherit;
 	bool metadata = geneve->collect_md;
 	__u8 tmp_vni[3];
 	__u32 vni;
@@ -1614,6 +1636,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		goto nla_put_failure;
 #endif
 
+	if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -1650,7 +1675,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	init_tnl_info(&info, dst_port);
-	err = geneve_configure(net, dev, NULL, &info, true, true);
+	err = geneve_configure(net, dev, NULL, &info, true, true, false);
 	if (err) {
 		free_netdev(dev);
 		return ERR_PTR(err);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 29d49b989acd..58faab897201 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -555,6 +555,7 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
+	IFLA_GENEVE_TTL_INHERIT,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 1c73d63068b1..141cbfdc5865 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -542,6 +542,7 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
+	IFLA_GENEVE_TTL_INHERIT,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 7969119293f5aa3b51040ae81a80e87c7b979b2d Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Sun, 9 Sep 2018 22:16:43 +0200
Subject: net: dsa: Add Lantiq / Intel GSWIP tag support

This handles the tag added by the PMAC on the VRX200 SoC line.

The GSWIP uses internally a GSWIP special tag which is located after the
Ethernet header. The PMAC which connects the GSWIP to the CPU converts
this special tag used by the GSWIP into the PMAC special tag which is
added in front of the Ethernet header.

This was tested with GSWIP 2.1 found in the VRX200 SoCs, other GSWIP
versions use slightly different PMAC special tags.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS         |   6 +++
 include/net/dsa.h   |   1 +
 net/dsa/Kconfig     |   3 ++
 net/dsa/Makefile    |   1 +
 net/dsa/dsa.c       |   3 ++
 net/dsa/dsa_priv.h  |   3 ++
 net/dsa/tag_gswip.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 126 insertions(+)
 create mode 100644 net/dsa/tag_gswip.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 0b2e27e8abef..a11fbbe2be73 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8167,6 +8167,12 @@ S:	Maintained
 F:	net/l3mdev
 F:	include/net/l3mdev.h
 
+LANTIQ / INTEL Ethernet drivers
+M:	Hauke Mehrtens <hauke@hauke-m.de>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	net/dsa/tag_gswip.c
+
 LANTIQ MIPS ARCHITECTURE
 M:	John Crispin <john@phrozen.org>
 L:	linux-mips@linux-mips.org
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 461e8a7661b7..23690c44e167 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -35,6 +35,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_BRCM_PREPEND,
 	DSA_TAG_PROTO_DSA,
 	DSA_TAG_PROTO_EDSA,
+	DSA_TAG_PROTO_GSWIP,
 	DSA_TAG_PROTO_KSZ,
 	DSA_TAG_PROTO_LAN9303,
 	DSA_TAG_PROTO_MTK,
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 4183e4ba27a5..48c41918fb35 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,6 +38,9 @@ config NET_DSA_TAG_DSA
 config NET_DSA_TAG_EDSA
 	bool
 
+config NET_DSA_TAG_GSWIP
+	bool
+
 config NET_DSA_TAG_KSZ
 	bool
 
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 9e4d3536f977..6e721f7a2947 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -9,6 +9,7 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o
 dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
 dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+dsa_core-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
 dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
 dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
 dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 45f70859f550..5f73e96cc9e6 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -52,6 +52,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
 #ifdef CONFIG_NET_DSA_TAG_EDSA
 	[DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
 #endif
+#ifdef CONFIG_NET_DSA_TAG_GSWIP
+	[DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops,
+#endif
 #ifdef CONFIG_NET_DSA_TAG_KSZ
 	[DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops,
 #endif
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 2868b5bb7e7d..9e4fd04ab53c 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -206,6 +206,9 @@ extern const struct dsa_device_ops dsa_netdev_ops;
 /* tag_edsa.c */
 extern const struct dsa_device_ops edsa_netdev_ops;
 
+/* tag_gswip.c */
+extern const struct dsa_device_ops gswip_netdev_ops;
+
 /* tag_ksz.c */
 extern const struct dsa_device_ops ksz_netdev_ops;
 
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
new file mode 100644
index 000000000000..49e9b73f1be3
--- /dev/null
+++ b/net/dsa/tag_gswip.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel / Lantiq GSWIP V2.0 PMAC tag support
+ *
+ * Copyright (C) 2017 - 2018 Hauke Mehrtens <hauke@hauke-m.de>
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+#define GSWIP_TX_HEADER_LEN		4
+
+/* special tag in TX path header */
+/* Byte 0 */
+#define GSWIP_TX_SLPID_SHIFT		0	/* source port ID */
+#define  GSWIP_TX_SLPID_CPU		2
+#define  GSWIP_TX_SLPID_APP1		3
+#define  GSWIP_TX_SLPID_APP2		4
+#define  GSWIP_TX_SLPID_APP3		5
+#define  GSWIP_TX_SLPID_APP4		6
+#define  GSWIP_TX_SLPID_APP5		7
+
+/* Byte 1 */
+#define GSWIP_TX_CRCGEN_DIS		BIT(7)
+#define GSWIP_TX_DPID_SHIFT		0	/* destination group ID */
+#define  GSWIP_TX_DPID_ELAN		0
+#define  GSWIP_TX_DPID_EWAN		1
+#define  GSWIP_TX_DPID_CPU		2
+#define  GSWIP_TX_DPID_APP1		3
+#define  GSWIP_TX_DPID_APP2		4
+#define  GSWIP_TX_DPID_APP3		5
+#define  GSWIP_TX_DPID_APP4		6
+#define  GSWIP_TX_DPID_APP5		7
+
+/* Byte 2 */
+#define GSWIP_TX_PORT_MAP_EN		BIT(7)
+#define GSWIP_TX_PORT_MAP_SEL		BIT(6)
+#define GSWIP_TX_LRN_DIS		BIT(5)
+#define GSWIP_TX_CLASS_EN		BIT(4)
+#define GSWIP_TX_CLASS_SHIFT		0
+#define GSWIP_TX_CLASS_MASK		GENMASK(3, 0)
+
+/* Byte 3 */
+#define GSWIP_TX_DPID_EN		BIT(0)
+#define GSWIP_TX_PORT_MAP_SHIFT		1
+#define GSWIP_TX_PORT_MAP_MASK		GENMASK(6, 1)
+
+#define GSWIP_RX_HEADER_LEN	8
+
+/* special tag in RX path header */
+/* Byte 7 */
+#define GSWIP_RX_SPPID_SHIFT		4
+#define GSWIP_RX_SPPID_MASK		GENMASK(6, 4)
+
+static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	int err;
+	u8 *gswip_tag;
+
+	err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN);
+	if (err)
+		return NULL;
+
+	skb_push(skb, GSWIP_TX_HEADER_LEN);
+
+	gswip_tag = skb->data;
+	gswip_tag[0] = GSWIP_TX_SLPID_CPU;
+	gswip_tag[1] = GSWIP_TX_DPID_ELAN;
+	gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL;
+	gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK;
+	gswip_tag[3] |= GSWIP_TX_DPID_EN;
+
+	return skb;
+}
+
+static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
+				     struct net_device *dev,
+				     struct packet_type *pt)
+{
+	int port;
+	u8 *gswip_tag;
+
+	if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
+		return NULL;
+
+	gswip_tag = skb->data - ETH_HLEN;
+
+	/* Get source port information */
+	port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
+	skb->dev = dsa_master_find_slave(dev, 0, port);
+	if (!skb->dev)
+		return NULL;
+
+	/* remove GSWIP tag */
+	skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
+
+	return skb;
+}
+
+const struct dsa_device_ops gswip_netdev_ops = {
+	.xmit = gswip_tag_xmit,
+	.rcv = gswip_tag_rcv,
+};
-- 
cgit v1.2.3


From 15033f0457dca569b284bef0c8d3ad55fb37eacb Mon Sep 17 00:00:00 2001
From: Andre Naujoks <nautsch2@gmail.com>
Date: Mon, 10 Sep 2018 10:27:15 +0200
Subject: ipv6: Add sockopt IPV6_MULTICAST_ALL analogue to IP_MULTICAST_ALL

The socket option will be enabled by default to ensure current behaviour
is not changed. This is the same for the IPv4 version.

A socket bound to in6addr_any and a specific port will receive all traffic
on that port. Analogue to IP_MULTICAST_ALL, disable this behaviour, if
one or more multicast groups were joined (using said socket) and only
pass on multicast traffic from groups, which were explicitly joined via
this socket.

Without this option disabled a socket (system even) joined to multiple
multicast groups is very hard to get right. Filtering by destination
address has to take place in user space to avoid receiving multicast
traffic from other multicast groups, which might have traffic on the same
port.

The extension of the IP_MULTICAST_ALL socketoption to just apply to ipv6,
too, is not done to avoid changing the behaviour of current applications.

Signed-off-by: Andre Naujoks <nautsch2@gmail.com>
Acked-By: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 ++-
 include/uapi/linux/in6.h |  1 +
 net/ipv6/af_inet6.c      |  1 +
 net/ipv6/ipv6_sockglue.c | 11 +++++++++++
 net/ipv6/mcast.c         |  2 +-
 5 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 8415bf1a9776..495e834c1367 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -274,7 +274,8 @@ struct ipv6_pinfo {
 						 */
 				dontfrag:1,
 				autoflowlabel:1,
-				autoflowlabel_set:1;
+				autoflowlabel_set:1,
+				mc_all:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index ed291e55f024..71d82fe15b03 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -177,6 +177,7 @@ struct in6_flowlabel_req {
 #define IPV6_V6ONLY		26
 #define IPV6_JOIN_ANYCAST	27
 #define IPV6_LEAVE_ANYCAST	28
+#define IPV6_MULTICAST_ALL	29
 
 /* IPV6_MTU_DISCOVER values */
 #define IPV6_PMTUDISC_DONT		0
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9a4261e50272..77ef8478234f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -209,6 +209,7 @@ lookup_protocol:
 	np->hop_limit	= -1;
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	np->mc_loop	= 1;
+	np->mc_all	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
 	np->repflow	= net->ipv6.sysctl.flowlabel_reflect;
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c0cac9cc3a28..381ce38940ae 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -674,6 +674,13 @@ done:
 			retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
 		break;
 	}
+	case IPV6_MULTICAST_ALL:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->mc_all = valbool;
+		retv = 0;
+		break;
+
 	case MCAST_JOIN_GROUP:
 	case MCAST_LEAVE_GROUP:
 	{
@@ -1266,6 +1273,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = np->mcast_oif;
 		break;
 
+	case IPV6_MULTICAST_ALL:
+		val = np->mc_all;
+		break;
+
 	case IPV6_UNICAST_IF:
 		val = (__force int)htonl((__u32) np->ucast_oif);
 		break;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 4ae54aaca373..6895e1dc0b03 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -636,7 +636,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
 	}
 	if (!mc) {
 		rcu_read_unlock();
-		return true;
+		return np->mc_all;
 	}
 	read_lock(&mc->sflock);
 	psl = mc->sflist;
-- 
cgit v1.2.3


From 9708d2b5b7c648e8e0a40d11e8cea12f6277f33c Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 11 Sep 2018 11:42:06 -0700
Subject: llc: avoid blocking in llc_sap_close()

llc_sap_close() is called by llc_sap_put() which
could be called in BH context in llc_rcv(). We can't
block in BH.

There is no reason to block it here, kfree_rcu() should
be sufficient.

Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/llc.h  | 1 +
 net/llc/llc_core.c | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/llc.h b/include/net/llc.h
index 890a87318014..df282d9b4017 100644
--- a/include/net/llc.h
+++ b/include/net/llc.h
@@ -66,6 +66,7 @@ struct llc_sap {
 	int sk_count;
 	struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES];
 	struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES];
+	struct rcu_head rcu;
 };
 
 static inline
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 260b3dc1b4a2..64d4bef04e73 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -127,9 +127,7 @@ void llc_sap_close(struct llc_sap *sap)
 	list_del_rcu(&sap->node);
 	spin_unlock_bh(&llc_sap_list_lock);
 
-	synchronize_rcu();
-
-	kfree(sap);
+	kfree_rcu(sap, rcu);
 }
 
 static struct packet_type llc_packet_type __read_mostly = {
-- 
cgit v1.2.3


From e4a2a3048ed93f0c354ad837f1d45fc8d389d538 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 12 Sep 2018 11:16:59 +0800
Subject: net: sock: introduce SOCK_XDP

This patch introduces a new sock flag - SOCK_XDP. This will be used
for notifying the upper layer that XDP program is attached on the
lower socket, and requires for extra headroom.

TUN will be the first user.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c  | 19 +++++++++++++++++++
 include/net/sock.h |  1 +
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ebd07ad82431..2c548bd20393 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 		tun_napi_init(tun, tfile, napi);
 	}
 
+	if (rtnl_dereference(tun->xdp_prog))
+		sock_set_flag(&tfile->sk, SOCK_XDP);
+
 	tun_set_real_num_queues(tun);
 
 	/* device is allowed to go away first, so no need to hold extra
@@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 		       struct netlink_ext_ack *extack)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
 	struct bpf_prog *old_prog;
+	int i;
 
 	old_prog = rtnl_dereference(tun->xdp_prog);
 	rcu_assign_pointer(tun->xdp_prog, prog);
 	if (old_prog)
 		bpf_prog_put(old_prog);
 
+	for (i = 0; i < tun->numqueues; i++) {
+		tfile = rtnl_dereference(tun->tfiles[i]);
+		if (prog)
+			sock_set_flag(&tfile->sk, SOCK_XDP);
+		else
+			sock_reset_flag(&tfile->sk, SOCK_XDP);
+	}
+	list_for_each_entry(tfile, &tun->disabled, next) {
+		if (prog)
+			sock_set_flag(&tfile->sk, SOCK_XDP);
+		else
+			sock_reset_flag(&tfile->sk, SOCK_XDP);
+	}
+
 	return 0;
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 433f45fc2d68..38cae35f6e16 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -800,6 +800,7 @@ enum sock_flags {
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
 	SOCK_TXTIME,
+	SOCK_XDP, /* XDP is attached */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-- 
cgit v1.2.3


From fe8dd45bb7556246c6b76277b1ba4296c91c2505 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 12 Sep 2018 11:17:06 +0800
Subject: tun: switch to new type of msg_control

This patch introduces to a new tun/tap specific msg_control:

#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR  2
struct tun_msg_ctl {
       int type;
       void *ptr;
};

This allows us to pass different kinds of msg_control through
sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will
be used by the existed vhost_net zerocopy code. The second is XDP
buff, which allows vhost_net to pass XDP buff to TUN. This could be
used to implement accepting an array of XDP buffs from vhost_net in
the following patches.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tap.c      | 18 ++++++++++++------
 drivers/net/tun.c      |  6 +++++-
 drivers/vhost/net.c    |  7 +++++--
 include/linux/if_tun.h | 14 ++++++++++++++
 4 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f0f7cd977667..7996ed7cbf18 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
 #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
 
 /* Get packet from user space buffer */
-static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
+static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 			    struct iov_iter *from, int noblock)
 {
 	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
@@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
+	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
 		struct iov_iter i;
 
 		copylen = vnet_hdr.hdr_len ?
@@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 	tap = rcu_dereference(q->tap);
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = m->msg_control;
+		skb_shinfo(skb)->destructor_arg = msg_control;
 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-	} else if (m && m->msg_control) {
-		struct ubuf_info *uarg = m->msg_control;
+	} else if (msg_control) {
+		struct ubuf_info *uarg = msg_control;
 		uarg->callback(uarg, false);
 	}
 
@@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m,
 		       size_t total_len)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
-	return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
+	struct tun_msg_ctl *ctl = m->msg_control;
+
+	if (ctl && ctl->type != TUN_MSG_UBUF)
+		return -EINVAL;
+
+	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
+			    m->msg_flags & MSG_DONTWAIT);
 }
 
 static int tap_recvmsg(struct socket *sock, struct msghdr *m,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3ae539374f6b..89779b58c7ca 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2431,11 +2431,15 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	int ret;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
+	struct tun_msg_ctl *ctl = m->msg_control;
 
 	if (!tun)
 		return -EBADFD;
 
-	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+	if (ctl && ctl->type != TUN_MSG_UBUF)
+		return -EINVAL;
+
+	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
 	tun_put(tun);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4e656f89cb22..fb01ce6d981c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 		.msg_controllen = 0,
 		.msg_flags = MSG_DONTWAIT,
 	};
+	struct tun_msg_ctl ctl;
 	size_t len, total_len = 0;
 	int err;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
 			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
+			msg.msg_control = &ctl;
+			ctl.type = TUN_MSG_UBUF;
+			ctl.ptr = ubuf;
+			msg.msg_controllen = sizeof(ctl);
 			ubufs = nvq->ubufs;
 			atomic_inc(&ubufs->refcount);
 			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3d2996dc7d85..12e3eebf0ce6 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -16,9 +16,23 @@
 #define __IF_TUN_H
 
 #include <uapi/linux/if_tun.h>
+#include <uapi/linux/virtio_net.h>
 
 #define TUN_XDP_FLAG 0x1UL
 
+#define TUN_MSG_UBUF 1
+#define TUN_MSG_PTR  2
+struct tun_msg_ctl {
+	unsigned short type;
+	unsigned short num;
+	void *ptr;
+};
+
+struct tun_xdp_hdr {
+	int buflen;
+	struct virtio_net_hdr gso;
+};
+
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-- 
cgit v1.2.3


From 293681f149a8dc4c9df2c09b2c4e873d474be5d4 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 13 Sep 2018 21:32:23 +0800
Subject: vxlan: Remove duplicated include from vxlan.h

Remove duplicated include.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b99a02ae3934..7ef15179f263 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -5,7 +5,6 @@
 #include <linux/if_vlan.h>
 #include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
-#include <net/udp_tunnel.h>
 
 /* VXLAN protocol (RFC 7348) header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-- 
cgit v1.2.3


From 52bb6677d530d37055092d86b4eab69dce6c166a Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Fri, 14 Sep 2018 16:00:51 +0800
Subject: net: move definition of pcpu_lstats to header file

pcpu_lstats is defined in several files, so unify them as one
and move to header file

Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    |  6 ------
 drivers/net/nlmon.c       |  6 ------
 drivers/net/vsockmon.c    | 14 ++++----------
 include/linux/netdevice.h |  6 ++++++
 4 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 30612497643c..a7207fa7e451 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -59,12 +59,6 @@
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
 
-struct pcpu_lstats {
-	u64			packets;
-	u64			bytes;
-	struct u64_stats_sync	syncp;
-};
-
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index 4b22955de191..dd0db7534cb3 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -6,12 +6,6 @@
 #include <linux/if_arp.h>
 #include <net/rtnetlink.h>
 
-struct pcpu_lstats {
-	u64 packets;
-	u64 bytes;
-	struct u64_stats_sync syncp;
-};
-
 static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	int len = skb->len;
diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c
index c28bdce14fd5..7bad5c95551f 100644
--- a/drivers/net/vsockmon.c
+++ b/drivers/net/vsockmon.c
@@ -11,12 +11,6 @@
 #define DEFAULT_MTU (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + \
 		     sizeof(struct af_vsockmon_hdr))
 
-struct pcpu_lstats {
-	u64 rx_packets;
-	u64 rx_bytes;
-	struct u64_stats_sync syncp;
-};
-
 static int vsockmon_dev_init(struct net_device *dev)
 {
 	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
@@ -56,8 +50,8 @@ static netdev_tx_t vsockmon_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);
 
 	u64_stats_update_begin(&stats->syncp);
-	stats->rx_bytes += len;
-	stats->rx_packets++;
+	stats->bytes += len;
+	stats->packets++;
 	u64_stats_update_end(&stats->syncp);
 
 	dev_kfree_skb(skb);
@@ -80,8 +74,8 @@ vsockmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 
 		do {
 			start = u64_stats_fetch_begin_irq(&vstats->syncp);
-			tbytes = vstats->rx_bytes;
-			tpackets = vstats->rx_packets;
+			tbytes = vstats->bytes;
+			tpackets = vstats->packets;
 		} while (u64_stats_fetch_retry_irq(&vstats->syncp, start));
 
 		packets += tpackets;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e2b3bd750c98..baed5d5088c5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2382,6 +2382,12 @@ struct pcpu_sw_netstats {
 	struct u64_stats_sync   syncp;
 };
 
+struct pcpu_lstats {
+	u64 packets;
+	u64 bytes;
+	struct u64_stats_sync syncp;
+};
+
 #define __netdev_alloc_pcpu_stats(type, gfp)				\
 ({									\
 	typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
-- 
cgit v1.2.3


From d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9 Mon Sep 17 00:00:00 2001
From: Petar Penkov <ppenkov@google.com>
Date: Fri, 14 Sep 2018 07:46:18 -0700
Subject: flow_dissector: implements flow dissector BPF hook

Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
path. The BPF program is per-network namespace.

Signed-off-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h         |   1 +
 include/linux/bpf_types.h   |   1 +
 include/linux/skbuff.h      |   7 +++
 include/net/net_namespace.h |   3 +
 include/net/sch_generic.h   |  12 +++-
 include/uapi/linux/bpf.h    |  26 +++++++++
 kernel/bpf/syscall.c        |   8 +++
 kernel/bpf/verifier.c       |  32 +++++++++++
 net/core/filter.c           |  70 +++++++++++++++++++++++
 net/core/flow_dissector.c   | 134 ++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 291 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 523481a3471b..988a00797bcd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -212,6 +212,7 @@ enum bpf_reg_type {
 	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
+	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
 };
 
 /* The information passed from prog-specific *_is_valid_access
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c090e7c0..22083712dd18 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..ce0e863f02a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -243,6 +243,8 @@ struct scatterlist;
 struct pipe_inode_info;
 struct iov_iter;
 struct napi_struct;
+struct bpf_prog;
+union bpf_attr;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
 
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+				       struct bpf_prog *prog);
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
+
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 9b5fdc50519a..99d4148e0f90 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -43,6 +43,7 @@ struct ctl_table_header;
 struct net_generic;
 struct uevent_sock;
 struct netns_ipvs;
+struct bpf_prog;
 
 
 #define NETDEV_HASHBITS    8
@@ -145,6 +146,8 @@ struct net {
 #endif
 	struct net_generic __rcu	*gen;
 
+	struct bpf_prog __rcu	*flow_dissector_prog;
+
 	/* Note : following structs are cache line aligned */
 #ifdef CONFIG_XFRM
 	struct netns_xfrm	xfrm;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..1b81ba85fd2d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@ struct Qdisc_ops;
 struct qdisc_walker;
 struct tcf_walker;
 struct module;
+struct bpf_flow_keys;
 
 typedef int tc_setup_cb_t(enum tc_setup_type type,
 			  void *type_data, void *cb_priv);
@@ -307,9 +308,14 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-	unsigned int		pkt_len;
-	u16			slave_dev_queue_mapping;
-	u16			tc_classid;
+	union {
+		struct {
+			unsigned int		pkt_len;
+			u16			slave_dev_queue_mapping;
+			u16			tc_classid;
+		};
+		struct bpf_flow_keys *flow_keys;
+	};
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4eba27..aa5ccd2385ed 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_FLOW_DISSECTOR,
 };
 
 enum bpf_attach_type {
@@ -172,6 +173,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
 	BPF_LIRC_MODE2,
+	BPF_FLOW_DISSECTOR,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2333,6 +2335,7 @@ struct __sk_buff {
 	/* ... here. */
 
 	__u32 data_meta;
+	struct bpf_flow_keys *flow_keys;
 };
 
 struct bpf_tunnel_key {
@@ -2778,4 +2781,27 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+struct bpf_flow_keys {
+	__u16	nhoff;
+	__u16	thoff;
+	__u16	addr_proto;			/* ETH_P_* of valid addrs */
+	__u8	is_frag;
+	__u8	is_first_frag;
+	__u8	is_encap;
+	__u8	ip_proto;
+	__be16	n_proto;
+	__be16	sport;
+	__be16	dport;
+	union {
+		struct {
+			__be32	ipv4_src;
+			__be32	ipv4_dst;
+		};
+		struct {
+			__u32	ipv6_src[4];	/* in6_addr; network order */
+			__u32	ipv6_dst[4];	/* in6_addr; network order */
+		};
+	};
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3c9636f03bb2..b3c2d09bcf7a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_LIRC_MODE2:
 		ptype = BPF_PROG_TYPE_LIRC_MODE2;
 		break;
+	case BPF_FLOW_DISSECTOR:
+		ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_LIRC_MODE2:
 		ret = lirc_prog_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_FLOW_DISSECTOR:
+		ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+		break;
 	default:
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 	}
@@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
+	case BPF_FLOW_DISSECTOR:
+		return skb_flow_dissector_bpf_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6ff1bac1795d..8ccbff4fff93 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -261,6 +261,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET]		= "pkt",
 	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
+	[PTR_TO_FLOW_KEYS]	= "flow_keys",
 };
 
 static char slot_type_char[] = {
@@ -965,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_FLOW_KEYS:
 	case CONST_PTR_TO_MAP:
 		return true;
 	default:
@@ -1238,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_LWT_XMIT:
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		if (meta)
 			return meta->pkt_access;
 
@@ -1321,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 	return -EACCES;
 }
 
+static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
+				  int size)
+{
+	if (size < 0 || off < 0 ||
+	    (u64)off + size > sizeof(struct bpf_flow_keys)) {
+		verbose(env, "invalid access to flow keys off=%d size=%d\n",
+			off, size);
+		return -EACCES;
+	}
+	return 0;
+}
+
 static bool __is_pointer_value(bool allow_ptr_leaks,
 			       const struct bpf_reg_state *reg)
 {
@@ -1422,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		 * right in front, treat it the very same way.
 		 */
 		return check_pkt_ptr_alignment(env, reg, off, size, strict);
+	case PTR_TO_FLOW_KEYS:
+		pointer_desc = "flow keys ";
+		break;
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -1692,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_packet_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_FLOW_KEYS) {
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose(env, "R%d leaks addr into flow keys\n",
+				value_regno);
+			return -EACCES;
+		}
+
+		err = check_flow_keys_access(env, off, size);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1839,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
+	case PTR_TO_FLOW_KEYS:
+		return check_flow_keys_access(env, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size,
 					zero_size_allowed);
@@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_CTX:
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_FLOW_KEYS:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
diff --git a/net/core/filter.c b/net/core/filter.c
index bf5b6efd369a..9cc76f134ddb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5123,6 +5123,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
 static const struct bpf_func_proto *
 lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -5241,6 +5252,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 		if (size != size_default)
 			return false;
 		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		if (size != sizeof(struct bpf_flow_keys *))
+			return false;
+		break;
 	default:
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_WRITE) {
@@ -5266,6 +5281,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5291,6 +5307,7 @@ static bool lwt_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		return false;
 	}
 
@@ -5501,6 +5518,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5702,6 +5720,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		return false;
 	}
 
@@ -5761,6 +5780,39 @@ static bool sk_msg_is_valid_access(int off, int size,
 	return true;
 }
 
+static bool flow_dissector_is_valid_access(int off, int size,
+					   enum bpf_access_type type,
+					   const struct bpf_prog *prog,
+					   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		info->reg_type = PTR_TO_FLOW_KEYS;
+		break;
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+		return false;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -6055,6 +6107,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      bpf_target_off(struct sock_common,
 						     skc_num, 2, target_size));
 		break;
+
+	case offsetof(struct __sk_buff, flow_keys):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, flow_keys);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, flow_keys);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
 	}
 
 	return insn - insn_buf;
@@ -7018,6 +7079,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
 const struct bpf_prog_ops sk_msg_prog_ops = {
 };
 
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+	.get_func_proto		= flow_dissector_func_proto,
+	.is_valid_access	= flow_dissector_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..5c5dd74b5b3b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -25,6 +25,9 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+
+static DEFINE_MUTEX(flow_dissector_mutex);
 
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+				       struct bpf_prog *prog)
+{
+	struct bpf_prog *attached;
+	struct net *net;
+
+	net = current->nsproxy->net_ns;
+	mutex_lock(&flow_dissector_mutex);
+	attached = rcu_dereference_protected(net->flow_dissector_prog,
+					     lockdep_is_held(&flow_dissector_mutex));
+	if (attached) {
+		/* Only one BPF program can be attached at a time */
+		mutex_unlock(&flow_dissector_mutex);
+		return -EEXIST;
+	}
+	rcu_assign_pointer(net->flow_dissector_prog, prog);
+	mutex_unlock(&flow_dissector_mutex);
+	return 0;
+}
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *attached;
+	struct net *net;
+
+	net = current->nsproxy->net_ns;
+	mutex_lock(&flow_dissector_mutex);
+	attached = rcu_dereference_protected(net->flow_dissector_prog,
+					     lockdep_is_held(&flow_dissector_mutex));
+	if (!attached) {
+		mutex_unlock(&flow_dissector_mutex);
+		return -ENOENT;
+	}
+	bpf_prog_put(attached);
+	RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+	mutex_unlock(&flow_dissector_mutex);
+	return 0;
+}
 /**
  * skb_flow_get_be16 - extract be16 entity
  * @skb: sk_buff to extract from
@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
 	return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
 }
 
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+				     struct flow_dissector *flow_dissector,
+				     void *target_container)
+{
+	struct flow_dissector_key_control *key_control;
+	struct flow_dissector_key_basic *key_basic;
+	struct flow_dissector_key_addrs *key_addrs;
+	struct flow_dissector_key_ports *key_ports;
+
+	key_control = skb_flow_dissector_target(flow_dissector,
+						FLOW_DISSECTOR_KEY_CONTROL,
+						target_container);
+	key_control->thoff = flow_keys->thoff;
+	if (flow_keys->is_frag)
+		key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+	if (flow_keys->is_first_frag)
+		key_control->flags |= FLOW_DIS_FIRST_FRAG;
+	if (flow_keys->is_encap)
+		key_control->flags |= FLOW_DIS_ENCAPSULATION;
+
+	key_basic = skb_flow_dissector_target(flow_dissector,
+					      FLOW_DISSECTOR_KEY_BASIC,
+					      target_container);
+	key_basic->n_proto = flow_keys->n_proto;
+	key_basic->ip_proto = flow_keys->ip_proto;
+
+	if (flow_keys->addr_proto == ETH_P_IP &&
+	    dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						      target_container);
+		key_addrs->v4addrs.src = flow_keys->ipv4_src;
+		key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
+		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	} else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+		   dissector_uses_key(flow_dissector,
+				      FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						      target_container);
+		memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
+		       sizeof(key_addrs->v6addrs));
+		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	}
+
+	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+		key_ports = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_PORTS,
+						      target_container);
+		key_ports->src = flow_keys->sport;
+		key_ports->dst = flow_keys->dport;
+	}
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_vlan *key_vlan;
 	enum flow_dissect_ret fdret;
 	enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+	struct bpf_prog *attached;
 	int num_hdrs = 0;
 	u8 ip_proto = 0;
 	bool ret;
@@ -658,6 +754,44 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
+	rcu_read_lock();
+	attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog)
+		       : NULL;
+	if (attached) {
+		/* Note that even though the const qualifier is discarded
+		 * throughout the execution of the BPF program, all changes(the
+		 * control block) are reverted after the BPF program returns.
+		 * Therefore, __skb_flow_dissect does not alter the skb.
+		 */
+		struct bpf_flow_keys flow_keys = {};
+		struct bpf_skb_data_end cb_saved;
+		struct bpf_skb_data_end *cb;
+		u32 result;
+
+		cb = (struct bpf_skb_data_end *)skb->cb;
+
+		/* Save Control Block */
+		memcpy(&cb_saved, cb, sizeof(cb_saved));
+		memset(cb, 0, sizeof(cb_saved));
+
+		/* Pass parameters to the BPF program */
+		cb->qdisc_cb.flow_keys = &flow_keys;
+		flow_keys.nhoff = nhoff;
+
+		bpf_compute_data_pointers((struct sk_buff *)skb);
+		result = BPF_PROG_RUN(attached, skb);
+
+		/* Restore state */
+		memcpy(cb, &cb_saved, sizeof(cb_saved));
+
+		__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+					 target_container);
+		key_control->thoff = min_t(u16, key_control->thoff, skb->len);
+		rcu_read_unlock();
+		return result == BPF_OK;
+	}
+	rcu_read_unlock();
+
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
-- 
cgit v1.2.3


From 02b408fae3d5552d10d1189fc0bd7e5b1e76af71 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 29 Aug 2018 00:19:00 +0200
Subject: netfilter: nf_tables: rt: allow checking if dst has xfrm attached

Useful e.g. to avoid NATting inner headers of to-be-encrypted packets.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_rt.c                   | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index e23290ffdc77..6c44cbbb2cda 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -826,12 +826,14 @@ enum nft_meta_keys {
  * @NFT_RT_NEXTHOP4: routing nexthop for IPv4
  * @NFT_RT_NEXTHOP6: routing nexthop for IPv6
  * @NFT_RT_TCPMSS: fetch current path tcp mss
+ * @NFT_RT_XFRM: boolean, skb->dst->xfrm != NULL
  */
 enum nft_rt_keys {
 	NFT_RT_CLASSID,
 	NFT_RT_NEXTHOP4,
 	NFT_RT_NEXTHOP6,
 	NFT_RT_TCPMSS,
+	NFT_RT_XFRM,
 	__NFT_RT_MAX
 };
 #define NFT_RT_MAX		(__NFT_RT_MAX - 1)
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 76dba9f6b6f6..f35fa33913ae 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -90,6 +90,11 @@ static void nft_rt_get_eval(const struct nft_expr *expr,
 	case NFT_RT_TCPMSS:
 		nft_reg_store16(dest, get_tcpmss(pkt, dst));
 		break;
+#ifdef CONFIG_XFRM
+	case NFT_RT_XFRM:
+		nft_reg_store8(dest, !!dst->xfrm);
+		break;
+#endif
 	default:
 		WARN_ON(1);
 		goto err;
@@ -130,6 +135,11 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
 	case NFT_RT_TCPMSS:
 		len = sizeof(u16);
 		break;
+#ifdef CONFIG_XFRM
+	case NFT_RT_XFRM:
+		len = sizeof(u8);
+		break;
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -164,6 +174,7 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
 	case NFT_RT_NEXTHOP4:
 	case NFT_RT_NEXTHOP6:
 	case NFT_RT_CLASSID:
+	case NFT_RT_XFRM:
 		return 0;
 	case NFT_RT_TCPMSS:
 		hooks = (1 << NF_INET_FORWARD) |
-- 
cgit v1.2.3


From cd5125d8f51882279f50506bb9c7e5e89dc9bef3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 29 Aug 2018 14:41:30 +0200
Subject: netfilter: nf_tables: split set destruction in deactivate and destroy
 phase

Splits unbind_set into destroy_set and unbinding operation.

Unbinding removes set from lists (so new transaction would not
find it anymore) but keeps memory allocated (so packet path continues
to work).

Rebind function is added to allow unrolling in case transaction
that wants to remove set is aborted.

Destroy function is added to free the memory, but this could occur
outside of transaction in the future.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  7 ++++++-
 net/netfilter/nf_tables_api.c     | 36 +++++++++++++++++++++++++-----------
 net/netfilter/nft_dynset.c        | 21 ++++++++++++++++++++-
 net/netfilter/nft_lookup.c        | 20 +++++++++++++++++++-
 net/netfilter/nft_objref.c        | 20 +++++++++++++++++++-
 5 files changed, 89 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 0f39ac487012..2c33958f3e7a 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -470,6 +470,9 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		       struct nft_set_binding *binding);
 void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 			  struct nft_set_binding *binding);
+void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
+			  struct nft_set_binding *binding);
+void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);
 
 /**
  *	enum nft_set_extensions - set extension type IDs
@@ -724,7 +727,9 @@ struct nft_expr_type {
  *	@eval: Expression evaluation function
  *	@size: full expression size, including private data size
  *	@init: initialization function
- *	@destroy: destruction function
+ *	@activate: activate expression in the next generation
+ *	@deactivate: deactivate expression in next generation
+ *	@destroy: destruction function, called after synchronize_rcu
  *	@dump: function to dump parameters
  *	@type: expression type
  *	@validate: validate expression, called during loop detection
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2cfb173cd0b2..220e6aab3fac 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -298,7 +298,7 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
 	return 0;
 }
 
-static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
 			     struct nft_set *set)
 {
 	struct nft_trans *trans;
@@ -318,7 +318,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
 	return 0;
 }
 
-static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
+static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
 {
 	int err;
 
@@ -3567,13 +3567,6 @@ static void nft_set_destroy(struct nft_set *set)
 	kvfree(set);
 }
 
-static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
-{
-	list_del_rcu(&set->list);
-	nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
-	nft_set_destroy(set);
-}
-
 static int nf_tables_delset(struct net *net, struct sock *nlsk,
 			    struct sk_buff *skb, const struct nlmsghdr *nlh,
 			    const struct nlattr * const nla[],
@@ -3668,17 +3661,38 @@ bind:
 }
 EXPORT_SYMBOL_GPL(nf_tables_bind_set);
 
-void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
 			  struct nft_set_binding *binding)
+{
+	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
+	    nft_is_active(ctx->net, set))
+		list_add_tail_rcu(&set->list, &ctx->table->sets);
+
+	list_add_tail_rcu(&binding->list, &set->bindings);
+}
+EXPORT_SYMBOL_GPL(nf_tables_rebind_set);
+
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+		          struct nft_set_binding *binding)
 {
 	list_del_rcu(&binding->list);
 
 	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
 	    nft_is_active(ctx->net, set))
-		nf_tables_set_destroy(ctx, set);
+		list_del_rcu(&set->list);
 }
 EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
 
+void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
+	    nft_is_active(ctx->net, set)) {
+		nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+		nft_set_destroy(set);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
+
 const struct nft_set_ext_type nft_set_ext_types[] = {
 	[NFT_SET_EXT_KEY]		= {
 		.align	= __alignof__(u32),
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 6e91a37d57f2..07d4efd3d851 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -235,14 +235,31 @@ err1:
 	return err;
 }
 
+static void nft_dynset_activate(const struct nft_ctx *ctx,
+				const struct nft_expr *expr)
+{
+	struct nft_dynset *priv = nft_expr_priv(expr);
+
+	nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+}
+
+static void nft_dynset_deactivate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr)
+{
+	struct nft_dynset *priv = nft_expr_priv(expr);
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
 static void nft_dynset_destroy(const struct nft_ctx *ctx,
 			       const struct nft_expr *expr)
 {
 	struct nft_dynset *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
 	if (priv->expr != NULL)
 		nft_expr_destroy(ctx, priv->expr);
+
+	nf_tables_destroy_set(ctx, priv->set);
 }
 
 static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -279,6 +296,8 @@ static const struct nft_expr_ops nft_dynset_ops = {
 	.eval		= nft_dynset_eval,
 	.init		= nft_dynset_init,
 	.destroy	= nft_dynset_destroy,
+	.activate	= nft_dynset_activate,
+	.deactivate	= nft_dynset_deactivate,
 	.dump		= nft_dynset_dump,
 };
 
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index ad13e8643599..227b2b15a19c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -121,12 +121,28 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
+static void nft_lookup_activate(const struct nft_ctx *ctx,
+				const struct nft_expr *expr)
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
+
+	nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+}
+
+static void nft_lookup_deactivate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr)
+{
+	struct nft_lookup *priv = nft_expr_priv(expr);
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
 static void nft_lookup_destroy(const struct nft_ctx *ctx,
 			       const struct nft_expr *expr)
 {
 	struct nft_lookup *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+	nf_tables_destroy_set(ctx, priv->set);
 }
 
 static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -209,6 +225,8 @@ static const struct nft_expr_ops nft_lookup_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
 	.eval		= nft_lookup_eval,
 	.init		= nft_lookup_init,
+	.activate	= nft_lookup_activate,
+	.deactivate	= nft_lookup_deactivate,
 	.destroy	= nft_lookup_destroy,
 	.dump		= nft_lookup_dump,
 	.validate	= nft_lookup_validate,
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index cdf348f751ec..a3185ca2a3a9 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -155,12 +155,28 @@ nla_put_failure:
 	return -1;
 }
 
+static void nft_objref_map_activate(const struct nft_ctx *ctx,
+				    const struct nft_expr *expr)
+{
+	struct nft_objref_map *priv = nft_expr_priv(expr);
+
+	nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+}
+
+static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
+				      const struct nft_expr *expr)
+{
+	struct nft_objref_map *priv = nft_expr_priv(expr);
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
 static void nft_objref_map_destroy(const struct nft_ctx *ctx,
 				   const struct nft_expr *expr)
 {
 	struct nft_objref_map *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+	nf_tables_destroy_set(ctx, priv->set);
 }
 
 static struct nft_expr_type nft_objref_type;
@@ -169,6 +185,8 @@ static const struct nft_expr_ops nft_objref_map_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
 	.eval		= nft_objref_map_eval,
 	.init		= nft_objref_map_init,
+	.activate	= nft_objref_map_activate,
+	.deactivate	= nft_objref_map_deactivate,
 	.destroy	= nft_objref_map_destroy,
 	.dump		= nft_objref_map_dump,
 };
-- 
cgit v1.2.3


From 0935d558840099b3679c67bb7468dc78fcbad940 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 29 Aug 2018 14:41:32 +0200
Subject: netfilter: nf_tables: asynchronous release

Release the committed transaction log from a work queue, moving
expensive synchronize_rcu out of the locked section and providing
opportunity to batch this.

On my test machine this cuts runtime of nft-test.py in half.
Based on earlier patch from Pablo Neira Ayuso.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c     | 56 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2c33958f3e7a..841835a387e1 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1298,12 +1298,14 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
  *
  *	@list: used internally
  *	@msg_type: message type
+ *	@put_net: ctx->net needs to be put
  *	@ctx: transaction context
  *	@data: internal information related to the transaction
  */
 struct nft_trans {
 	struct list_head		list;
 	int				msg_type;
+	bool				put_net;
 	struct nft_ctx			ctx;
 	char				data[0];
 };
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d4c531e0a26f..72dbdb1faa3c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -27,6 +27,8 @@
 static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
+static LIST_HEAD(nf_tables_destroy_list);
+static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
 static u64 table_handle;
 
 enum {
@@ -64,6 +66,8 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
 
 	net->nft.validate_state = new_validate_state;
 }
+static void nf_tables_trans_destroy_work(struct work_struct *w);
+static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
 
 static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
@@ -2453,7 +2457,6 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
 {
 	struct nft_expr *expr;
 
-	lockdep_assert_held(&ctx->net->nft.commit_mutex);
 	/*
 	 * Careful: some expressions might not be initialized in case this
 	 * is called on error from nf_tables_newrule().
@@ -6224,19 +6227,28 @@ static void nft_commit_release(struct nft_trans *trans)
 		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
 		break;
 	}
+
+	if (trans->put_net)
+		put_net(trans->ctx.net);
+
 	kfree(trans);
 }
 
-static void nf_tables_commit_release(struct net *net)
+static void nf_tables_trans_destroy_work(struct work_struct *w)
 {
 	struct nft_trans *trans, *next;
+	LIST_HEAD(head);
+
+	spin_lock(&nf_tables_destroy_list_lock);
+	list_splice_init(&nf_tables_destroy_list, &head);
+	spin_unlock(&nf_tables_destroy_list_lock);
 
-	if (list_empty(&net->nft.commit_list))
+	if (list_empty(&head))
 		return;
 
 	synchronize_rcu();
 
-	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+	list_for_each_entry_safe(trans, next, &head, list) {
 		list_del(&trans->list);
 		nft_commit_release(trans);
 	}
@@ -6367,6 +6379,37 @@ static void nft_chain_del(struct nft_chain *chain)
 	list_del_rcu(&chain->list);
 }
 
+static void nf_tables_commit_release(struct net *net)
+{
+	struct nft_trans *trans;
+
+	/* all side effects have to be made visible.
+	 * For example, if a chain named 'foo' has been deleted, a
+	 * new transaction must not find it anymore.
+	 *
+	 * Memory reclaim happens asynchronously from work queue
+	 * to prevent expensive synchronize_rcu() in commit phase.
+	 */
+	if (list_empty(&net->nft.commit_list)) {
+		mutex_unlock(&net->nft.commit_mutex);
+		return;
+	}
+
+	trans = list_last_entry(&net->nft.commit_list,
+				struct nft_trans, list);
+	get_net(trans->ctx.net);
+	WARN_ON_ONCE(trans->put_net);
+
+	trans->put_net = true;
+	spin_lock(&nf_tables_destroy_list_lock);
+	list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
+	spin_unlock(&nf_tables_destroy_list_lock);
+
+	mutex_unlock(&net->nft.commit_mutex);
+
+	schedule_work(&trans_destroy_work);
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
 	struct nft_trans *trans, *next;
@@ -6528,9 +6571,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		}
 	}
 
-	nf_tables_commit_release(net);
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
-	mutex_unlock(&net->nft.commit_mutex);
+	nf_tables_commit_release(net);
 
 	return 0;
 }
@@ -7304,6 +7346,7 @@ static int __init nf_tables_module_init(void)
 {
 	int err;
 
+	spin_lock_init(&nf_tables_destroy_list_lock);
 	err = register_pernet_subsys(&nf_tables_net_ops);
 	if (err < 0)
 		return err;
@@ -7343,6 +7386,7 @@ static void __exit nf_tables_module_exit(void)
 	unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
 	nft_chain_filter_fini();
 	unregister_pernet_subsys(&nf_tables_net_ops);
+	cancel_work_sync(&trans_destroy_work);
 	rcu_barrier();
 	nf_tables_core_module_exit();
 }
-- 
cgit v1.2.3


From 2953d80ff04862b26a2e628fb3948868f54d753d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 31 Aug 2018 20:29:37 +0200
Subject: netfilter: remove obsolete need_conntrack stub

as of a0ae2562c6c4b27 ("netfilter: conntrack: remove l3proto
abstraction") there are no users anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_common.h | 3 ---
 net/netfilter/nf_conntrack_standalone.c       | 7 -------
 2 files changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 03097fa70975..e142b2b5f1ea 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -19,7 +19,4 @@ struct ip_conntrack_stat {
 	unsigned int search_restart;
 };
 
-/* call to create an explicit dependency on nf_conntrack. */
-void need_conntrack(void);
-
 #endif /* _NF_CONNTRACK_COMMON_H */
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 13279f683da9..e3b329ebafd3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -720,10 +720,3 @@ static void __exit nf_conntrack_standalone_fini(void)
 
 module_init(nf_conntrack_standalone_init);
 module_exit(nf_conntrack_standalone_fini);
-
-/* Some modules need us, but don't depend directly on any symbol.
-   They should call this. */
-void need_conntrack(void)
-{
-}
-EXPORT_SYMBOL_GPL(need_conntrack);
-- 
cgit v1.2.3


From 6c47260250fc6114ce2012db13e1cd3938a27b73 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 3 Sep 2018 18:09:40 +0200
Subject: netfilter: nf_tables: add xfrm expression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

supports fetching saddr/daddr of tunnel mode states, request id and spi.
If direction is 'in', use inbound skb secpath, else dst->xfrm.

Joint work with Máté Eckl.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  29 +++
 net/netfilter/Kconfig                    |   7 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_xfrm.c                 | 293 +++++++++++++++++++++++++++++++
 4 files changed, 330 insertions(+)
 create mode 100644 net/netfilter/nft_xfrm.c

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 6c44cbbb2cda..702e4f0bec56 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1514,6 +1514,35 @@ enum nft_devices_attributes {
 };
 #define NFTA_DEVICE_MAX		(__NFTA_DEVICE_MAX - 1)
 
+/*
+ * enum nft_xfrm_attributes - nf_tables xfrm expr netlink attributes
+ *
+ * @NFTA_XFRM_DREG: destination register (NLA_U32)
+ * @NFTA_XFRM_KEY: enum nft_xfrm_keys (NLA_U32)
+ * @NFTA_XFRM_DIR: direction (NLA_U8)
+ * @NFTA_XFRM_SPNUM: index in secpath array (NLA_U32)
+ */
+enum nft_xfrm_attributes {
+	NFTA_XFRM_UNSPEC,
+	NFTA_XFRM_DREG,
+	NFTA_XFRM_KEY,
+	NFTA_XFRM_DIR,
+	NFTA_XFRM_SPNUM,
+	__NFTA_XFRM_MAX
+};
+#define NFTA_XFRM_MAX (__NFTA_XFRM_MAX - 1)
+
+enum nft_xfrm_keys {
+	NFT_XFRM_KEY_UNSPEC,
+	NFT_XFRM_KEY_DADDR_IP4,
+	NFT_XFRM_KEY_DADDR_IP6,
+	NFT_XFRM_KEY_SADDR_IP4,
+	NFT_XFRM_KEY_SADDR_IP6,
+	NFT_XFRM_KEY_REQID,
+	NFT_XFRM_KEY_SPI,
+	__NFT_XFRM_KEY_MAX,
+};
+#define NFT_XFRM_KEY_MAX (__NFT_XFRM_KEY_MAX - 1)
 
 /**
  * enum nft_trace_attributes - nf_tables trace netlink attributes
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f61c306de1d0..2ab870ef233a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -625,6 +625,13 @@ config NFT_FIB_INET
 	  The lookup will be delegated to the IPv4 or IPv6 FIB depending
 	  on the protocol of the packet.
 
+config NFT_XFRM
+	tristate "Netfilter nf_tables xfrm/IPSec security association matching"
+	depends on XFRM
+	help
+	  This option adds an expression that you can use to extract properties
+	  of a packets security association.
+
 config NFT_SOCKET
 	tristate "Netfilter nf_tables socket match support"
 	depends on IPV6 || IPV6=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 16895e045b66..4ddf3ef51ece 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
 obj-$(CONFIG_NFT_OSF)		+= nft_osf.o
 obj-$(CONFIG_NFT_TPROXY)	+= nft_tproxy.o
+obj-$(CONFIG_NFT_XFRM)		+= nft_xfrm.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
new file mode 100644
index 000000000000..3cf71a2e375b
--- /dev/null
+++ b/net/netfilter/nft_xfrm.c
@@ -0,0 +1,293 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Generic part shared by ipv4 and ipv6 backends.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <linux/in.h>
+#include <net/xfrm.h>
+
+static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
+	[NFTA_XFRM_KEY]		= { .type = NLA_U32 },
+	[NFTA_XFRM_DIR]		= { .type = NLA_U8 },
+	[NFTA_XFRM_SPNUM]	= { .type = NLA_U32 },
+	[NFTA_XFRM_DREG]	= { .type = NLA_U32 },
+};
+
+struct nft_xfrm {
+	enum nft_xfrm_keys	key:8;
+	enum nft_registers	dreg:8;
+	u8			dir;
+	u8			spnum;
+};
+
+static int nft_xfrm_get_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct nft_xfrm *priv = nft_expr_priv(expr);
+	unsigned int len = 0;
+	u32 spnum = 0;
+	u8 dir;
+
+	if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG])
+		return -EINVAL;
+
+	switch (ctx->family) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+	case NFPROTO_INET:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
+	switch (priv->key) {
+	case NFT_XFRM_KEY_REQID:
+	case NFT_XFRM_KEY_SPI:
+		len = sizeof(u32);
+		break;
+	case NFT_XFRM_KEY_DADDR_IP4:
+	case NFT_XFRM_KEY_SADDR_IP4:
+		len = sizeof(struct in_addr);
+		break;
+	case NFT_XFRM_KEY_DADDR_IP6:
+	case NFT_XFRM_KEY_SADDR_IP6:
+		len = sizeof(struct in6_addr);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	dir = nla_get_u8(tb[NFTA_XFRM_DIR]);
+	switch (dir) {
+	case XFRM_POLICY_IN:
+	case XFRM_POLICY_OUT:
+		priv->dir = dir;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_XFRM_SPNUM])
+		spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM]));
+
+	if (spnum >= XFRM_MAX_DEPTH)
+		return -ERANGE;
+
+	priv->spnum = spnum;
+
+	priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]);
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, len);
+}
+
+/* Return true if key asks for daddr/saddr and current
+ * state does have a valid address (BEET, TUNNEL).
+ */
+static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
+{
+	switch (k) {
+	case NFT_XFRM_KEY_DADDR_IP4:
+	case NFT_XFRM_KEY_SADDR_IP4:
+		if (family == NFPROTO_IPV4)
+			break;
+		return false;
+	case NFT_XFRM_KEY_DADDR_IP6:
+	case NFT_XFRM_KEY_SADDR_IP6:
+		if (family == NFPROTO_IPV6)
+			break;
+		return false;
+	default:
+		return true;
+	}
+
+	return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+}
+
+static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
+				   struct nft_regs *regs,
+				   const struct xfrm_state *state,
+				   u8 family)
+{
+	u32 *dest = &regs->data[priv->dreg];
+
+	if (!xfrm_state_addr_ok(priv->key, family, state->props.mode)) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	switch (priv->key) {
+	case NFT_XFRM_KEY_UNSPEC:
+	case __NFT_XFRM_KEY_MAX:
+		WARN_ON_ONCE(1);
+		break;
+	case NFT_XFRM_KEY_DADDR_IP4:
+		*dest = state->id.daddr.a4;
+		return;
+	case NFT_XFRM_KEY_DADDR_IP6:
+		memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
+		return;
+	case NFT_XFRM_KEY_SADDR_IP4:
+		*dest = state->props.saddr.a4;
+		return;
+	case NFT_XFRM_KEY_SADDR_IP6:
+		memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
+		return;
+	case NFT_XFRM_KEY_REQID:
+		*dest = state->props.reqid;
+		return;
+	case NFT_XFRM_KEY_SPI:
+		*dest = state->id.spi;
+		return;
+	}
+
+	regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv,
+				    struct nft_regs *regs,
+				    const struct nft_pktinfo *pkt)
+{
+	const struct sec_path *sp = pkt->skb->sp;
+	const struct xfrm_state *state;
+
+	if (sp == NULL || sp->len <= priv->spnum) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	state = sp->xvec[priv->spnum];
+	nft_xfrm_state_get_key(priv, regs, state, nft_pf(pkt));
+}
+
+static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv,
+				  struct nft_regs *regs,
+				  const struct nft_pktinfo *pkt)
+{
+	const struct dst_entry *dst = skb_dst(pkt->skb);
+	int i;
+
+	for (i = 0; dst && dst->xfrm;
+	     dst = ((const struct xfrm_dst *)dst)->child, i++) {
+		if (i < priv->spnum)
+			continue;
+
+		nft_xfrm_state_get_key(priv, regs, dst->xfrm, nft_pf(pkt));
+		return;
+	}
+
+	regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_xfrm_get_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	const struct nft_xfrm *priv = nft_expr_priv(expr);
+
+	switch (priv->dir) {
+	case XFRM_POLICY_IN:
+		nft_xfrm_get_eval_in(priv, regs, pkt);
+		break;
+	case XFRM_POLICY_OUT:
+		nft_xfrm_get_eval_out(priv, regs, pkt);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		regs->verdict.code = NFT_BREAK;
+		break;
+	}
+}
+
+static int nft_xfrm_get_dump(struct sk_buff *skb,
+			     const struct nft_expr *expr)
+{
+	const struct nft_xfrm *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg))
+		return -1;
+
+	if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key)))
+		return -1;
+	if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir))
+		return -1;
+	if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum)))
+		return -1;
+
+	return 0;
+}
+
+static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			     const struct nft_data **data)
+{
+	const struct nft_xfrm *priv = nft_expr_priv(expr);
+	unsigned int hooks;
+
+	switch (priv->dir) {
+	case XFRM_POLICY_IN:
+		hooks = (1 << NF_INET_FORWARD) |
+			(1 << NF_INET_LOCAL_IN) |
+			(1 << NF_INET_PRE_ROUTING);
+		break;
+	case XFRM_POLICY_OUT:
+		hooks = (1 << NF_INET_FORWARD) |
+			(1 << NF_INET_LOCAL_OUT) |
+			(1 << NF_INET_POST_ROUTING);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+
+static struct nft_expr_type nft_xfrm_type;
+static const struct nft_expr_ops nft_xfrm_get_ops = {
+	.type		= &nft_xfrm_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_xfrm)),
+	.eval		= nft_xfrm_get_eval,
+	.init		= nft_xfrm_get_init,
+	.dump		= nft_xfrm_get_dump,
+	.validate	= nft_xfrm_validate,
+};
+
+static struct nft_expr_type nft_xfrm_type __read_mostly = {
+	.name		= "xfrm",
+	.ops		= &nft_xfrm_get_ops,
+	.policy		= nft_xfrm_policy,
+	.maxattr	= NFTA_XFRM_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_xfrm_module_init(void)
+{
+	return nft_register_expr(&nft_xfrm_type);
+}
+
+static void __exit nft_xfrm_module_exit(void)
+{
+	nft_unregister_expr(&nft_xfrm_type);
+}
+
+module_init(nft_xfrm_module_init);
+module_exit(nft_xfrm_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("xfrm");
-- 
cgit v1.2.3


From 0d704967f4a49cc2212350b3e4a8231f8b4283ed Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 4 Sep 2018 12:07:55 +0200
Subject: netfilter: xt_cgroup: shrink size of v2 path

cgroup v2 path field is PATH_MAX which is too large, this is placing too
much pressure on memory allocation for people with many rules doing
cgroup v1 classid matching, side effects of this are bug reports like:

https://bugzilla.kernel.org/show_bug.cgi?id=200639

This patch registers a new revision that shrinks the cgroup path to 512
bytes, which is the same approach we follow in similar extensions that
have a path field.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 include/uapi/linux/netfilter/xt_cgroup.h | 16 +++++++
 net/netfilter/xt_cgroup.c                | 72 ++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h
index e96dfa1b34f7..b74e370d6133 100644
--- a/include/uapi/linux/netfilter/xt_cgroup.h
+++ b/include/uapi/linux/netfilter/xt_cgroup.h
@@ -22,4 +22,20 @@ struct xt_cgroup_info_v1 {
 	void		*priv __attribute__((aligned(8)));
 };
 
+#define XT_CGROUP_PATH_MAX	512
+
+struct xt_cgroup_info_v2 {
+	__u8		has_path;
+	__u8		has_classid;
+	__u8		invert_path;
+	__u8		invert_classid;
+	union {
+		char	path[XT_CGROUP_PATH_MAX];
+		__u32	classid;
+	};
+
+	/* kernel internal data */
+	void		*priv __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_CGROUP_H */
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 5d92e1781980..5cb1ecb29ea4 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -68,6 +68,38 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
 	return 0;
 }
 
+static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
+{
+	struct xt_cgroup_info_v2 *info = par->matchinfo;
+	struct cgroup *cgrp;
+
+	if ((info->invert_path & ~1) || (info->invert_classid & ~1))
+		return -EINVAL;
+
+	if (!info->has_path && !info->has_classid) {
+		pr_info("xt_cgroup: no path or classid specified\n");
+		return -EINVAL;
+	}
+
+	if (info->has_path && info->has_classid) {
+		pr_info_ratelimited("path and classid specified\n");
+		return -EINVAL;
+	}
+
+	info->priv = NULL;
+	if (info->has_path) {
+		cgrp = cgroup_get_from_path(info->path);
+		if (IS_ERR(cgrp)) {
+			pr_info_ratelimited("invalid path, errno=%ld\n",
+					    PTR_ERR(cgrp));
+			return -EINVAL;
+		}
+		info->priv = cgrp;
+	}
+
+	return 0;
+}
+
 static bool
 cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -99,6 +131,24 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 			info->invert_classid;
 }
 
+static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_cgroup_info_v2 *info = par->matchinfo;
+	struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
+	struct cgroup *ancestor = info->priv;
+	struct sock *sk = skb->sk;
+
+	if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
+		return false;
+
+	if (ancestor)
+		return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
+			info->invert_path;
+	else
+		return (info->classid == sock_cgroup_classid(skcd)) ^
+			info->invert_classid;
+}
+
 static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
 {
 	struct xt_cgroup_info_v1 *info = par->matchinfo;
@@ -107,6 +157,14 @@ static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
 		cgroup_put(info->priv);
 }
 
+static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par)
+{
+	struct xt_cgroup_info_v2 *info = par->matchinfo;
+
+	if (info->priv)
+		cgroup_put(info->priv);
+}
+
 static struct xt_match cgroup_mt_reg[] __read_mostly = {
 	{
 		.name		= "cgroup",
@@ -134,6 +192,20 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
 				  (1 << NF_INET_POST_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 	},
+	{
+		.name		= "cgroup",
+		.revision	= 2,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= cgroup_mt_check_v2,
+		.match		= cgroup_mt_v2,
+		.matchsize	= sizeof(struct xt_cgroup_info_v2),
+		.usersize	= offsetof(struct xt_cgroup_info_v2, priv),
+		.destroy	= cgroup_mt_destroy_v2,
+		.me		= THIS_MODULE,
+		.hooks		= (1 << NF_INET_LOCAL_OUT) |
+				  (1 << NF_INET_POST_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+	},
 };
 
 static int __init cgroup_mt_init(void)
-- 
cgit v1.2.3


From 7a3dd8c8979ce48b99cb0e9b7435a97f0716138a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 14 Sep 2018 13:01:46 -0700
Subject: tls: async support causes out-of-bounds access in crypto APIs

When async support was added it needed to access the sk from the async
callback to report errors up the stack. The patch tried to use space
after the aead request struct by directly setting the reqsize field in
aead_request. This is an internal field that should not be used
outside the crypto APIs. It is used by the crypto code to define extra
space for private structures used in the crypto context. Users of the
API then use crypto_aead_reqsize() and add the returned amount of
bytes to the end of the request memory allocation before posting the
request to encrypt/decrypt APIs.

So this breaks (with general protection fault and KASAN error, if
enabled) because the request sent to decrypt is shorter than required
causing the crypto API out-of-bounds errors. Also it seems unlikely the
sk is even valid by the time it gets to the callback because of memset
in crypto layer.

Anyways, fix this by holding the sk in the skb->sk field when the
callback is set up and because the skb is already passed through to
the callback handler via void* we can access it in the handler. Then
in the handler we need to be careful to NULL the pointer again before
kfree_skb. I added comments on both the setup (in tls_do_decryption)
and when we clear it from the crypto callback handler
tls_decrypt_done(). After this selftests pass again and fixes KASAN
errors/warnings.

Fixes: 94524d8fc965 ("net/tls: Add support for async decryption of tls records")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Reviewed-by: Vakul Garg <Vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h |  4 ----
 net/tls/tls_sw.c  | 39 +++++++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index cd0a65bd92f9..8630d28bd951 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -128,10 +128,6 @@ struct tls_sw_context_rx {
 	bool async_notify;
 };
 
-struct decrypt_req_ctx {
-	struct sock *sk;
-};
-
 struct tls_record_info {
 	struct list_head list;
 	u32 end_seq;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 8aa4c1dafd6a..f4aa7cdb01ca 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -122,25 +122,32 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
 static void tls_decrypt_done(struct crypto_async_request *req, int err)
 {
 	struct aead_request *aead_req = (struct aead_request *)req;
-	struct decrypt_req_ctx *req_ctx =
-			(struct decrypt_req_ctx *)(aead_req + 1);
-
 	struct scatterlist *sgout = aead_req->dst;
-
-	struct tls_context *tls_ctx = tls_get_ctx(req_ctx->sk);
-	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-	int pending = atomic_dec_return(&ctx->decrypt_pending);
+	struct tls_sw_context_rx *ctx;
+	struct tls_context *tls_ctx;
 	struct scatterlist *sg;
+	struct sk_buff *skb;
 	unsigned int pages;
+	int pending;
+
+	skb = (struct sk_buff *)req->data;
+	tls_ctx = tls_get_ctx(skb->sk);
+	ctx = tls_sw_ctx_rx(tls_ctx);
+	pending = atomic_dec_return(&ctx->decrypt_pending);
 
 	/* Propagate if there was an err */
 	if (err) {
 		ctx->async_wait.err = err;
-		tls_err_abort(req_ctx->sk, err);
+		tls_err_abort(skb->sk, err);
 	}
 
+	/* After using skb->sk to propagate sk through crypto async callback
+	 * we need to NULL it again.
+	 */
+	skb->sk = NULL;
+
 	/* Release the skb, pages and memory allocated for crypto req */
-	kfree_skb(req->data);
+	kfree_skb(skb);
 
 	/* Skip the first S/G entry as it points to AAD */
 	for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) {
@@ -175,11 +182,13 @@ static int tls_do_decryption(struct sock *sk,
 			       (u8 *)iv_recv);
 
 	if (async) {
-		struct decrypt_req_ctx *req_ctx;
-
-		req_ctx = (struct decrypt_req_ctx *)(aead_req + 1);
-		req_ctx->sk = sk;
-
+		/* Using skb->sk to push sk through to crypto async callback
+		 * handler. This allows propagating errors up to the socket
+		 * if needed. It _must_ be cleared in the async handler
+		 * before kfree_skb is called. We _know_ skb->sk is NULL
+		 * because it is a clone from strparser.
+		 */
+		skb->sk = sk;
 		aead_request_set_callback(aead_req,
 					  CRYPTO_TFM_REQ_MAY_BACKLOG,
 					  tls_decrypt_done, skb);
@@ -1463,8 +1472,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_aead;
 
 	if (sw_ctx_rx) {
-		(*aead)->reqsize = sizeof(struct decrypt_req_ctx);
-
 		/* Set up strparser */
 		memset(&cb, 0, sizeof(cb));
 		cb.rcv_msg = tls_queue;
-- 
cgit v1.2.3


From 568b742a9d9888aca876b6ad9fa45490f18bee0a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 17 Sep 2018 11:57:28 +0200
Subject: netlink: add NLA_REJECT policy type

In some situations some netlink attributes may be used for output
only (kernel->userspace) or may be reserved for future use. It's
then helpful to be able to prevent userspace from using them in
messages sent to the kernel, since they'd otherwise be ignored and
any future will become impossible if this happens.

Add NLA_REJECT to the policy which does nothing but reject (with
EINVAL) validation of any messages containing this attribute.
Allow for returning a specific extended ACK error message in the
validation_data pointer.

While at it clear up the documentation a bit - the NLA_BITFIELD32
documentation was added to the list of len field descriptions.

Also, use NL_SET_BAD_ATTR() in one place where it's open-coded.

The specific case I have in mind now is a shared nested attribute
containing request/response data, and it would be pointless and
potentially confusing to have userspace include response data in
the messages that actually contain a request.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 13 ++++++++++++-
 lib/nlattr.c          | 23 ++++++++++++++++-------
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0c154f98e987..b318b0a9f6c3 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -180,6 +180,7 @@ enum {
 	NLA_S32,
 	NLA_S64,
 	NLA_BITFIELD32,
+	NLA_REJECT,
 	__NLA_TYPE_MAX,
 };
 
@@ -208,9 +209,19 @@ enum {
  *    NLA_MSECS            Leaving the length field zero will verify the
  *                         given type fits, using it verifies minimum length
  *                         just like "All other"
- *    NLA_BITFIELD32      A 32-bit bitmap/bitselector attribute
+ *    NLA_BITFIELD32       Unused
+ *    NLA_REJECT           Unused
  *    All other            Minimum length of attribute payload
  *
+ * Meaning of `validation_data' field:
+ *    NLA_BITFIELD32       This is a 32-bit bitmap/bitselector attribute and
+ *                         validation data must point to a u32 value of valid
+ *                         flags
+ *    NLA_REJECT           This attribute is always rejected and validation data
+ *                         may point to a string to report as the error instead
+ *                         of the generic one in extended ACK.
+ *    All other            Unused
+ *
  * Example:
  * static const struct nla_policy my_policy[ATTR_MAX+1] = {
  * 	[ATTR_FOO] = { .type = NLA_U16 },
diff --git a/lib/nlattr.c b/lib/nlattr.c
index e335bcafa9e4..36d74b079151 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -69,7 +69,8 @@ static int validate_nla_bitfield32(const struct nlattr *nla,
 }
 
 static int validate_nla(const struct nlattr *nla, int maxtype,
-			const struct nla_policy *policy)
+			const struct nla_policy *policy,
+			const char **error_msg)
 {
 	const struct nla_policy *pt;
 	int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
@@ -87,6 +88,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 	}
 
 	switch (pt->type) {
+	case NLA_REJECT:
+		if (pt->validation_data && error_msg)
+			*error_msg = pt->validation_data;
+		return -EINVAL;
+
 	case NLA_FLAG:
 		if (attrlen > 0)
 			return -ERANGE;
@@ -180,11 +186,10 @@ int nla_validate(const struct nlattr *head, int len, int maxtype,
 	int rem;
 
 	nla_for_each_attr(nla, head, len, rem) {
-		int err = validate_nla(nla, maxtype, policy);
+		int err = validate_nla(nla, maxtype, policy, NULL);
 
 		if (err < 0) {
-			if (extack)
-				extack->bad_attr = nla;
+			NL_SET_BAD_ATTR(extack, nla);
 			return err;
 		}
 	}
@@ -250,11 +255,15 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
 		u16 type = nla_type(nla);
 
 		if (type > 0 && type <= maxtype) {
+			static const char _msg[] = "Attribute failed policy validation";
+			const char *msg = _msg;
+
 			if (policy) {
-				err = validate_nla(nla, maxtype, policy);
+				err = validate_nla(nla, maxtype, policy, &msg);
 				if (err < 0) {
-					NL_SET_ERR_MSG_ATTR(extack, nla,
-							    "Attribute failed policy validation");
+					NL_SET_BAD_ATTR(extack, nla);
+					if (extack)
+						extack->_msg = msg;
 					goto errout;
 				}
 			}
-- 
cgit v1.2.3


From b60b87fc2996240e298529a46e122ef62ef9c27f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 17 Sep 2018 11:57:29 +0200
Subject: netlink: add ethernet address policy types

Commonly, ethernet addresses are just using a policy of
	{ .len = ETH_ALEN }
which leaves userspace free to send more data than it should,
which may hide bugs.

Introduce NLA_EXACT_LEN which checks for exact size, rejecting
the attribute if it's not exactly that length. Also add
NLA_EXACT_LEN_WARN which requires the minimum length and will
warn on longer attributes, for backward compatibility.

Use these to define NLA_POLICY_ETH_ADDR (new strict policy) and
NLA_POLICY_ETH_ADDR_COMPAT (compatible policy with warning);
these are used like this:

    static const struct nla_policy <name>[...] = {
        [NL_ATTR_NAME] = NLA_POLICY_ETH_ADDR,
        ...
    };

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 13 +++++++++++++
 lib/nlattr.c          |  8 +++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index b318b0a9f6c3..318b1ded3833 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -181,6 +181,8 @@ enum {
 	NLA_S64,
 	NLA_BITFIELD32,
 	NLA_REJECT,
+	NLA_EXACT_LEN,
+	NLA_EXACT_LEN_WARN,
 	__NLA_TYPE_MAX,
 };
 
@@ -211,6 +213,10 @@ enum {
  *                         just like "All other"
  *    NLA_BITFIELD32       Unused
  *    NLA_REJECT           Unused
+ *    NLA_EXACT_LEN        Attribute must have exactly this length, otherwise
+ *                         it is rejected.
+ *    NLA_EXACT_LEN_WARN   Attribute should have exactly this length, a warning
+ *                         is logged if it is longer, shorter is rejected.
  *    All other            Minimum length of attribute payload
  *
  * Meaning of `validation_data' field:
@@ -236,6 +242,13 @@ struct nla_policy {
 	void            *validation_data;
 };
 
+#define NLA_POLICY_EXACT_LEN(_len)	{ .type = NLA_EXACT_LEN, .len = _len }
+#define NLA_POLICY_EXACT_LEN_WARN(_len)	{ .type = NLA_EXACT_LEN_WARN, \
+					  .len = _len }
+
+#define NLA_POLICY_ETH_ADDR		NLA_POLICY_EXACT_LEN(ETH_ALEN)
+#define NLA_POLICY_ETH_ADDR_COMPAT	NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
+
 /**
  * struct nl_info - netlink source information
  * @nlh: Netlink message header of original request
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 36d74b079151..bb6fe5ed4ecf 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -82,12 +82,18 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 
 	BUG_ON(pt->type > NLA_TYPE_MAX);
 
-	if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
+	if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) ||
+	    (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) {
 		pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
 				    current->comm, type);
 	}
 
 	switch (pt->type) {
+	case NLA_EXACT_LEN:
+		if (attrlen != pt->len)
+			return -ERANGE;
+		break;
+
 	case NLA_REJECT:
 		if (pt->validation_data && error_msg)
 			*error_msg = pt->validation_data;
-- 
cgit v1.2.3


From 14d73416792afa84f6a7245ee474d2432069da56 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Mon, 17 Sep 2018 18:46:55 +0800
Subject: veth: rename pcpu_vstats as pcpu_lstats

struct pcpu_vstats and pcpu_lstats have same members and
usage, and pcpu_lstats is used in many files, so rename
pcpu_vstats as pcpu_lstats to reduce duplicate definition

Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/veth.c        | 22 ++++++++--------------
 include/linux/netdevice.h |  1 -
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 8fc64b67f01e..224c56a4e2b1 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -37,12 +37,6 @@
 #define VETH_XDP_TX		BIT(0)
 #define VETH_XDP_REDIR		BIT(1)
 
-struct pcpu_vstats {
-	u64			packets;
-	u64			bytes;
-	struct u64_stats_sync	syncp;
-};
-
 struct veth_rq {
 	struct napi_struct	xdp_napi;
 	struct net_device	*dev;
@@ -217,7 +211,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	skb_tx_timestamp(skb);
 	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
-		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+		struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);
 
 		u64_stats_update_begin(&stats->syncp);
 		stats->bytes += length;
@@ -236,7 +230,7 @@ drop:
 	return NETDEV_TX_OK;
 }
 
-static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
+static u64 veth_stats_one(struct pcpu_lstats *result, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	int cpu;
@@ -244,7 +238,7 @@ static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
 	result->packets = 0;
 	result->bytes = 0;
 	for_each_possible_cpu(cpu) {
-		struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
+		struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu);
 		u64 packets, bytes;
 		unsigned int start;
 
@@ -264,7 +258,7 @@ static void veth_get_stats64(struct net_device *dev,
 {
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer;
-	struct pcpu_vstats one;
+	struct pcpu_lstats one;
 
 	tot->tx_dropped = veth_stats_one(&one, dev);
 	tot->tx_bytes = one.bytes;
@@ -830,13 +824,13 @@ static int veth_dev_init(struct net_device *dev)
 {
 	int err;
 
-	dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
-	if (!dev->vstats)
+	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
+	if (!dev->lstats)
 		return -ENOMEM;
 
 	err = veth_alloc_queues(dev);
 	if (err) {
-		free_percpu(dev->vstats);
+		free_percpu(dev->lstats);
 		return err;
 	}
 
@@ -846,7 +840,7 @@ static int veth_dev_init(struct net_device *dev)
 static void veth_dev_free(struct net_device *dev)
 {
 	veth_free_queues(dev);
-	free_percpu(dev->vstats);
+	free_percpu(dev->lstats);
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index baed5d5088c5..1cbbf77a685f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2000,7 +2000,6 @@ struct net_device {
 		struct pcpu_lstats __percpu		*lstats;
 		struct pcpu_sw_netstats __percpu	*tstats;
 		struct pcpu_dstats __percpu		*dstats;
-		struct pcpu_vstats __percpu		*vstats;
 	};
 
 #if IS_ENABLED(CONFIG_GARP)
-- 
cgit v1.2.3


From 2dfd184abd38fd72d80715fa8b00c9de78490200 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 18 Sep 2018 16:20:18 -0400
Subject: flow_dissector: fix build failure without CONFIG_NET

If boolean CONFIG_BPF_SYSCALL is enabled, kernel/bpf/syscall.c will
call flow_dissector functions from net/core/flow_dissector.c.

This causes this build failure if CONFIG_NET is disabled:

    kernel/bpf/syscall.o: In function `__x64_sys_bpf':
    syscall.c:(.text+0x3278): undefined reference to
    `skb_flow_dissector_bpf_prog_attach'
    syscall.c:(.text+0x3310): undefined reference to
    `skb_flow_dissector_bpf_prog_detach'
    kernel/bpf/syscall.o:(.rodata+0x3f0): undefined reference to
    `flow_dissector_prog_ops'
    kernel/bpf/verifier.o:(.rodata+0x250): undefined reference to
    `flow_dissector_verifier_ops'

Analogous to other optional BPF program types in syscall.c, add stubs
if the relevant functions are not compiled and move the BPF_PROG_TYPE
definition in the #ifdef CONFIG_NET block.

Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h |  2 +-
 include/linux/skbuff.h    | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 22083712dd18..c9bd6fb765b0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -16,6 +16,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
@@ -32,7 +33,6 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
-BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ce0e863f02a2..76be85ea392a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1194,10 +1194,23 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
 
+#ifdef CONFIG_NET
 int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
 				       struct bpf_prog *prog);
 
 int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
+#else
+static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+						     struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+#endif
 
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From 93e66024b0249cec81e91328c55a754efd3192e0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 12 Sep 2018 15:19:07 +0200
Subject: netfilter: conntrack: pass nf_hook_state to packet and error handlers

nf_hook_state contains all the hook meta-information: netns, protocol family,
hook location, and so on.

Instead of only passing selected information, pass a pointer to entire
structure.

This will allow to merge the error and the packet handlers and remove
the ->new() function in followup patches.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h    |  3 +-
 include/net/netfilter/nf_conntrack_l4proto.h |  7 ++--
 net/netfilter/nf_conntrack_core.c            | 49 ++++++++++++-------------
 net/netfilter/nf_conntrack_proto.c           |  8 ++---
 net/netfilter/nf_conntrack_proto_dccp.c      | 17 +++++----
 net/netfilter/nf_conntrack_proto_generic.c   |  3 +-
 net/netfilter/nf_conntrack_proto_gre.c       |  3 +-
 net/netfilter/nf_conntrack_proto_icmp.c      | 36 ++++++++++---------
 net/netfilter/nf_conntrack_proto_icmpv6.c    | 29 ++++++++-------
 net/netfilter/nf_conntrack_proto_sctp.c      | 12 ++++---
 net/netfilter/nf_conntrack_proto_tcp.c       | 28 ++++++++-------
 net/netfilter/nf_conntrack_proto_udp.c       | 54 +++++++++++++++-------------
 net/openvswitch/conntrack.c                  |  8 +++--
 13 files changed, 142 insertions(+), 115 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 2a3e0974a6af..afc9b3620473 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -20,8 +20,7 @@
 /* This header is used to share core functionality between the
    standalone connection tracking module, and the compatibility layer's use
    of connection tracking. */
-unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
-			     struct sk_buff *skb);
+unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state);
 
 int nf_conntrack_init_net(struct net *net);
 void nf_conntrack_cleanup_net(struct net *net);
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 8465263b297d..a857a0adfb31 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -45,7 +45,8 @@ struct nf_conntrack_l4proto {
 	int (*packet)(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo);
+		      enum ip_conntrack_info ctinfo,
+		      const struct nf_hook_state *state);
 
 	/* Called when a new connection for this protocol found;
 	 * returns TRUE if it's OK.  If so, packet() called next. */
@@ -55,9 +56,9 @@ struct nf_conntrack_l4proto {
 	/* Called when a conntrack entry is destroyed */
 	void (*destroy)(struct nf_conn *ct);
 
-	int (*error)(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+	int (*error)(struct nf_conn *tmpl, struct sk_buff *skb,
 		     unsigned int dataoff,
-		     u_int8_t pf, unsigned int hooknum);
+		     const struct nf_hook_state *state);
 
 	/* called by gc worker if table is full */
 	bool (*can_early_drop)(const struct nf_conn *ct);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index a676d5f76bdc..8e275f4cdcdd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1436,12 +1436,12 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 
 /* On success, returns 0, sets skb->_nfct | ctinfo */
 static int
-resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
+resolve_normal_ct(struct nf_conn *tmpl,
 		  struct sk_buff *skb,
 		  unsigned int dataoff,
-		  u_int16_t l3num,
 		  u_int8_t protonum,
-		  const struct nf_conntrack_l4proto *l4proto)
+		  const struct nf_conntrack_l4proto *l4proto,
+		  const struct nf_hook_state *state)
 {
 	const struct nf_conntrack_zone *zone;
 	struct nf_conntrack_tuple tuple;
@@ -1452,17 +1452,18 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	u32 hash;
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
-			     dataoff, l3num, protonum, net, &tuple, l4proto)) {
+			     dataoff, state->pf, protonum, state->net,
+			     &tuple, l4proto)) {
 		pr_debug("Can't get tuple\n");
 		return 0;
 	}
 
 	/* look for tuple match */
 	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
-	hash = hash_conntrack_raw(&tuple, net);
-	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
+	hash = hash_conntrack_raw(&tuple, state->net);
+	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
 	if (!h) {
-		h = init_conntrack(net, tmpl, &tuple, l4proto,
+		h = init_conntrack(state->net, tmpl, &tuple, l4proto,
 				   skb, dataoff, hash);
 		if (!h)
 			return 0;
@@ -1492,12 +1493,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 }
 
 unsigned int
-nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
-		struct sk_buff *skb)
+nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
 {
 	const struct nf_conntrack_l4proto *l4proto;
-	struct nf_conn *ct, *tmpl;
 	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct, *tmpl;
 	u_int8_t protonum;
 	int dataoff, ret;
 
@@ -1506,32 +1506,32 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		/* Previously seen (loopback or untracked)?  Ignore. */
 		if ((tmpl && !nf_ct_is_template(tmpl)) ||
 		     ctinfo == IP_CT_UNTRACKED) {
-			NF_CT_STAT_INC_ATOMIC(net, ignore);
+			NF_CT_STAT_INC_ATOMIC(state->net, ignore);
 			return NF_ACCEPT;
 		}
 		skb->_nfct = 0;
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
-	dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum);
+	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
 	if (dataoff <= 0) {
 		pr_debug("not prepared to track yet or error occurred\n");
-		NF_CT_STAT_INC_ATOMIC(net, error);
-		NF_CT_STAT_INC_ATOMIC(net, invalid);
+		NF_CT_STAT_INC_ATOMIC(state->net, error);
+		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
 		ret = NF_ACCEPT;
 		goto out;
 	}
 
-	l4proto = __nf_ct_l4proto_find(pf, protonum);
+	l4proto = __nf_ct_l4proto_find(state->pf, protonum);
 
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
 	 * core what to do with the packet. */
 	if (l4proto->error != NULL) {
-		ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
+		ret = l4proto->error(tmpl, skb, dataoff, state);
 		if (ret <= 0) {
-			NF_CT_STAT_INC_ATOMIC(net, error);
-			NF_CT_STAT_INC_ATOMIC(net, invalid);
+			NF_CT_STAT_INC_ATOMIC(state->net, error);
+			NF_CT_STAT_INC_ATOMIC(state->net, invalid);
 			ret = -ret;
 			goto out;
 		}
@@ -1540,10 +1540,11 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			goto out;
 	}
 repeat:
-	ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto);
+	ret = resolve_normal_ct(tmpl, skb, dataoff,
+				protonum, l4proto, state);
 	if (ret < 0) {
 		/* Too stressed to deal. */
-		NF_CT_STAT_INC_ATOMIC(net, drop);
+		NF_CT_STAT_INC_ATOMIC(state->net, drop);
 		ret = NF_DROP;
 		goto out;
 	}
@@ -1551,21 +1552,21 @@ repeat:
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct) {
 		/* Not valid part of a connection */
-		NF_CT_STAT_INC_ATOMIC(net, invalid);
+		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
 		ret = NF_ACCEPT;
 		goto out;
 	}
 
-	ret = l4proto->packet(ct, skb, dataoff, ctinfo);
+	ret = l4proto->packet(ct, skb, dataoff, ctinfo, state);
 	if (ret <= 0) {
 		/* Invalid: inverse of the return code tells
 		 * the netfilter core what to do */
 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
 		nf_conntrack_put(&ct->ct_general);
 		skb->_nfct = 0;
-		NF_CT_STAT_INC_ATOMIC(net, invalid);
+		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
 		if (ret == -NF_DROP)
-			NF_CT_STAT_INC_ATOMIC(net, drop);
+			NF_CT_STAT_INC_ATOMIC(state->net, drop);
 		/* Special case: TCP tracker reports an attempt to reopen a
 		 * closed/aborted connection. We have to go back and create a
 		 * fresh conntrack.
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 51c5d7eec0a3..4896ba44becb 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -455,7 +455,7 @@ static unsigned int ipv4_conntrack_in(void *priv,
 				      struct sk_buff *skb,
 				      const struct nf_hook_state *state)
 {
-	return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
+	return nf_conntrack_in(skb, state);
 }
 
 static unsigned int ipv4_conntrack_local(void *priv,
@@ -477,7 +477,7 @@ static unsigned int ipv4_conntrack_local(void *priv,
 		return NF_ACCEPT;
 	}
 
-	return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
+	return nf_conntrack_in(skb, state);
 }
 
 /* Connection tracking may drop packets, but never alters them, so
@@ -690,14 +690,14 @@ static unsigned int ipv6_conntrack_in(void *priv,
 				      struct sk_buff *skb,
 				      const struct nf_hook_state *state)
 {
-	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
+	return nf_conntrack_in(skb, state);
 }
 
 static unsigned int ipv6_conntrack_local(void *priv,
 					 struct sk_buff *skb,
 					 const struct nf_hook_state *state)
 {
-	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
+	return nf_conntrack_in(skb, state);
 }
 
 static unsigned int ipv6_helper(void *priv,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index f3f91ed2c21a..8595c79742a2 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -439,7 +439,8 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
 }
 
 static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
-		       unsigned int dataoff, enum ip_conntrack_info ctinfo)
+		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
+		       const struct nf_hook_state *state)
 {
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	struct dccp_hdr _dh, *dh;
@@ -527,9 +528,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-static int dccp_error(struct net *net, struct nf_conn *tmpl,
+static int dccp_error(struct nf_conn *tmpl,
 		      struct sk_buff *skb, unsigned int dataoff,
-		      u_int8_t pf, unsigned int hooknum)
+		      const struct nf_hook_state *state)
 {
 	struct dccp_hdr _dh, *dh;
 	unsigned int dccp_len = skb->len - dataoff;
@@ -557,9 +558,10 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
 		}
 	}
 
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
-				pf)) {
+	if (state->hook == NF_INET_PRE_ROUTING &&
+	    state->net->ct.sysctl_checksum &&
+	    nf_checksum_partial(skb, state->hook, dataoff, cscov,
+				IPPROTO_DCCP, state->pf)) {
 		msg = "nf_ct_dccp: bad checksum ";
 		goto out_invalid;
 	}
@@ -572,7 +574,8 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
 	return NF_ACCEPT;
 
 out_invalid:
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf,
+			       IPPROTO_DCCP, "%s", msg);
 	return -NF_ACCEPT;
 }
 
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 1df3244ecd07..9940161cfef1 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -46,7 +46,8 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
 static int generic_packet(struct nf_conn *ct,
 			  const struct sk_buff *skb,
 			  unsigned int dataoff,
-			  enum ip_conntrack_info ctinfo)
+			  enum ip_conntrack_info ctinfo,
+			  const struct nf_hook_state *state)
 {
 	const unsigned int *timeout = nf_ct_timeout_lookup(ct);
 
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 650eb4fba2c5..85313a191456 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -235,7 +235,8 @@ static unsigned int *gre_get_timeouts(struct net *net)
 static int gre_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo)
+		      enum ip_conntrack_info ctinfo,
+		      const struct nf_hook_state *state)
 {
 	/* If we've seen traffic both ways, this is a GRE connection.
 	 * Extend timeout. */
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 43c7e1a217b9..c3a304b53245 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -81,7 +81,8 @@ static unsigned int *icmp_get_timeouts(struct net *net)
 static int icmp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo)
+		       enum ip_conntrack_info ctinfo,
+		       const struct nf_hook_state *state)
 {
 	/* Do not immediately delete the connection after the first
 	   successful reply to avoid excessive conntrackd traffic
@@ -120,8 +121,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
 static int
-icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
-		 unsigned int hooknum)
+icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb,
+		   const struct nf_hook_state *state)
 {
 	struct nf_conntrack_tuple innertuple, origtuple;
 	const struct nf_conntrack_l4proto *innerproto;
@@ -137,7 +138,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	if (!nf_ct_get_tuplepr(skb,
 			       skb_network_offset(skb) + ip_hdrlen(skb)
 						       + sizeof(struct icmphdr),
-			       PF_INET, net, &origtuple)) {
+			       PF_INET, state->net, &origtuple)) {
 		pr_debug("icmp_error_message: failed to get tuple\n");
 		return -NF_ACCEPT;
 	}
@@ -154,7 +155,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 
 	ctinfo = IP_CT_RELATED;
 
-	h = nf_conntrack_find_get(net, zone, &innertuple);
+	h = nf_conntrack_find_get(state->net, zone, &innertuple);
 	if (!h) {
 		pr_debug("icmp_error_message: no match\n");
 		return -NF_ACCEPT;
@@ -168,17 +169,19 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-static void icmp_error_log(const struct sk_buff *skb, struct net *net,
-			   u8 pf, const char *msg)
+static void icmp_error_log(const struct sk_buff *skb,
+			   const struct nf_hook_state *state,
+			   const char *msg)
 {
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf,
+			       IPPROTO_ICMP, "%s", msg);
 }
 
 /* Small and modified version of icmp_rcv */
 static int
-icmp_error(struct net *net, struct nf_conn *tmpl,
+icmp_error(struct nf_conn *tmpl,
 	   struct sk_buff *skb, unsigned int dataoff,
-	   u8 pf, unsigned int hooknum)
+	   const struct nf_hook_state *state)
 {
 	const struct icmphdr *icmph;
 	struct icmphdr _ih;
@@ -186,14 +189,15 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	/* Not enough header? */
 	icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
 	if (icmph == NULL) {
-		icmp_error_log(skb, net, pf, "short packet");
+		icmp_error_log(skb, state, "short packet");
 		return -NF_ACCEPT;
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
-		icmp_error_log(skb, net, pf, "bad hw icmp checksum");
+	if (state->net->ct.sysctl_checksum &&
+	    state->hook == NF_INET_PRE_ROUTING &&
+	    nf_ip_checksum(skb, state->hook, dataoff, 0)) {
+		icmp_error_log(skb, state, "bad hw icmp checksum");
 		return -NF_ACCEPT;
 	}
 
@@ -204,7 +208,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	 *		  discarded.
 	 */
 	if (icmph->type > NR_ICMP_TYPES) {
-		icmp_error_log(skb, net, pf, "invalid icmp type");
+		icmp_error_log(skb, state, "invalid icmp type");
 		return -NF_ACCEPT;
 	}
 
@@ -216,7 +220,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	    icmph->type != ICMP_REDIRECT)
 		return NF_ACCEPT;
 
-	return icmp_error_message(net, tmpl, skb, hooknum);
+	return icmp_error_message(tmpl, skb, state);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 97e40f77d678..bb5c98b0af89 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -94,7 +94,8 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
 static int icmpv6_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo)
+		       enum ip_conntrack_info ctinfo,
+		       const struct nf_hook_state *state)
 {
 	unsigned int *timeout = nf_ct_timeout_lookup(ct);
 
@@ -179,16 +180,19 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	return NF_ACCEPT;
 }
 
-static void icmpv6_error_log(const struct sk_buff *skb, struct net *net,
-			     u8 pf, const char *msg)
+static void icmpv6_error_log(const struct sk_buff *skb,
+			     const struct nf_hook_state *state,
+			     const char *msg)
 {
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf,
+			       IPPROTO_ICMPV6, "%s", msg);
 }
 
 static int
-icmpv6_error(struct net *net, struct nf_conn *tmpl,
-	     struct sk_buff *skb, unsigned int dataoff,
-	     u8 pf, unsigned int hooknum)
+icmpv6_error(struct nf_conn *tmpl,
+	     struct sk_buff *skb,
+	     unsigned int dataoff,
+	     const struct nf_hook_state *state)
 {
 	const struct icmp6hdr *icmp6h;
 	struct icmp6hdr _ih;
@@ -196,13 +200,14 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 
 	icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
 	if (icmp6h == NULL) {
-		icmpv6_error_log(skb, net, pf, "short packet");
+		icmpv6_error_log(skb, state, "short packet");
 		return -NF_ACCEPT;
 	}
 
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
-		icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed");
+	if (state->hook == NF_INET_PRE_ROUTING &&
+	    state->net->ct.sysctl_checksum &&
+	    nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) {
+		icmpv6_error_log(skb, state, "ICMPv6 checksum failed");
 		return -NF_ACCEPT;
 	}
 
@@ -217,7 +222,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	if (icmp6h->icmp6_type >= 128)
 		return NF_ACCEPT;
 
-	return icmpv6_error_message(net, tmpl, skb, dataoff);
+	return icmpv6_error_message(state->net, tmpl, skb, dataoff);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index e4d738d34cd0..34b80cea4a56 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -277,7 +277,8 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
 static int sctp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo)
+		       enum ip_conntrack_info ctinfo,
+		       const struct nf_hook_state *state)
 {
 	enum sctp_conntrack new_state, old_state;
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -471,9 +472,9 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
-static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
+static int sctp_error(struct nf_conn *tpl, struct sk_buff *skb,
 		      unsigned int dataoff,
-		      u8 pf, unsigned int hooknum)
+		      const struct nf_hook_state *state)
 {
 	const struct sctphdr *sh;
 	const char *logmsg;
@@ -482,7 +483,8 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
 		logmsg = "nf_ct_sctp: short packet ";
 		goto out_invalid;
 	}
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	if (state->hook == NF_INET_PRE_ROUTING &&
+	    state->net->ct.sysctl_checksum &&
 	    skb->ip_summed == CHECKSUM_NONE) {
 		if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
 			logmsg = "nf_ct_sctp: failed to read header ";
@@ -497,7 +499,7 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
 	}
 	return NF_ACCEPT;
 out_invalid:
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg);
 	return -NF_ACCEPT;
 }
 
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b4bdf9eda7b7..5128f0a79ed4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -717,18 +717,18 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
 	[TCPHDR_ACK|TCPHDR_URG]			= 1,
 };
 
-static void tcp_error_log(const struct sk_buff *skb, struct net *net,
-			  u8 pf, const char *msg)
+static void tcp_error_log(const struct sk_buff *skb,
+			  const struct nf_hook_state *state,
+			  const char *msg)
 {
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
 }
 
 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
-static int tcp_error(struct net *net, struct nf_conn *tmpl,
+static int tcp_error(struct nf_conn *tmpl,
 		     struct sk_buff *skb,
 		     unsigned int dataoff,
-		     u_int8_t pf,
-		     unsigned int hooknum)
+		     const struct nf_hook_state *state)
 {
 	const struct tcphdr *th;
 	struct tcphdr _tcph;
@@ -738,13 +738,13 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	/* Smaller that minimal TCP header? */
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
-		tcp_error_log(skb, net, pf, "short packet");
+		tcp_error_log(skb, state, "short packet");
 		return -NF_ACCEPT;
 	}
 
 	/* Not whole TCP header or malformed packet */
 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
-		tcp_error_log(skb, net, pf, "truncated packet");
+		tcp_error_log(skb, state, "truncated packet");
 		return -NF_ACCEPT;
 	}
 
@@ -753,16 +753,17 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	 * because the checksum is assumed to be correct.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
-		tcp_error_log(skb, net, pf, "bad checksum");
+	if (state->net->ct.sysctl_checksum &&
+	    state->hook == NF_INET_PRE_ROUTING &&
+	    nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
+		tcp_error_log(skb, state, "bad checksum");
 		return -NF_ACCEPT;
 	}
 
 	/* Check TCP flags. */
 	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
 	if (!tcp_valid_flags[tcpflags]) {
-		tcp_error_log(skb, net, pf, "invalid tcp flag combination");
+		tcp_error_log(skb, state, "invalid tcp flag combination");
 		return -NF_ACCEPT;
 	}
 
@@ -773,7 +774,8 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 static int tcp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo)
+		      enum ip_conntrack_info ctinfo,
+		      const struct nf_hook_state *state)
 {
 	struct net *net = nf_ct_net(ct);
 	struct nf_tcp_net *tn = tcp_pernet(net);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 3065fb8ef91b..bf59d32bba98 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -46,7 +46,8 @@ static unsigned int *udp_get_timeouts(struct net *net)
 static int udp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo)
+		      enum ip_conntrack_info ctinfo,
+		      const struct nf_hook_state *state)
 {
 	unsigned int *timeouts;
 
@@ -77,16 +78,17 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
 }
 
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-static void udplite_error_log(const struct sk_buff *skb, struct net *net,
-			      u8 pf, const char *msg)
+static void udplite_error_log(const struct sk_buff *skb,
+			      const struct nf_hook_state *state,
+			      const char *msg)
 {
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf,
+			       IPPROTO_UDPLITE, "%s", msg);
 }
 
-static int udplite_error(struct net *net, struct nf_conn *tmpl,
-			 struct sk_buff *skb,
+static int udplite_error(struct nf_conn *tmpl, struct sk_buff *skb,
 			 unsigned int dataoff,
-			 u8 pf, unsigned int hooknum)
+			 const struct nf_hook_state *state)
 {
 	unsigned int udplen = skb->len - dataoff;
 	const struct udphdr *hdr;
@@ -96,7 +98,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 	/* Header is too small? */
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (!hdr) {
-		udplite_error_log(skb, net, pf, "short packet");
+		udplite_error_log(skb, state, "short packet");
 		return -NF_ACCEPT;
 	}
 
@@ -104,21 +106,22 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 	if (cscov == 0) {
 		cscov = udplen;
 	} else if (cscov < sizeof(*hdr) || cscov > udplen) {
-		udplite_error_log(skb, net, pf, "invalid checksum coverage");
+		udplite_error_log(skb, state, "invalid checksum coverage");
 		return -NF_ACCEPT;
 	}
 
 	/* UDPLITE mandates checksums */
 	if (!hdr->check) {
-		udplite_error_log(skb, net, pf, "checksum missing");
+		udplite_error_log(skb, state, "checksum missing");
 		return -NF_ACCEPT;
 	}
 
 	/* Checksum invalid? Ignore. */
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
-				pf)) {
-		udplite_error_log(skb, net, pf, "bad checksum");
+	if (state->hook == NF_INET_PRE_ROUTING &&
+	    state->net->ct.sysctl_checksum &&
+	    nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP,
+				state->pf)) {
+		udplite_error_log(skb, state, "bad checksum");
 		return -NF_ACCEPT;
 	}
 
@@ -126,16 +129,17 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 }
 #endif
 
-static void udp_error_log(const struct sk_buff *skb, struct net *net,
-			  u8 pf, const char *msg)
+static void udp_error_log(const struct sk_buff *skb,
+			  const struct nf_hook_state *state,
+			  const char *msg)
 {
-	nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg);
+	nf_l4proto_log_invalid(skb, state->net, state->pf,
+			       IPPROTO_UDP, "%s", msg);
 }
 
-static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+static int udp_error(struct nf_conn *tmpl, struct sk_buff *skb,
 		     unsigned int dataoff,
-		     u_int8_t pf,
-		     unsigned int hooknum)
+		     const struct nf_hook_state *state)
 {
 	unsigned int udplen = skb->len - dataoff;
 	const struct udphdr *hdr;
@@ -144,13 +148,13 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	/* Header is too small? */
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
-		udp_error_log(skb, net, pf, "short packet");
+		udp_error_log(skb, state, "short packet");
 		return -NF_ACCEPT;
 	}
 
 	/* Truncated/malformed packets */
 	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
-		udp_error_log(skb, net, pf, "truncated/malformed packet");
+		udp_error_log(skb, state, "truncated/malformed packet");
 		return -NF_ACCEPT;
 	}
 
@@ -162,9 +166,9 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 	 * We skip checking packets on the outgoing path
 	 * because the checksum is assumed to be correct.
 	 * FIXME: Source route IP option packets --RR */
-	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
-	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
-		udp_error_log(skb, net, pf, "bad checksum");
+	if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING &&
+	    nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) {
+		udp_error_log(skb, state, "bad checksum");
 		return -NF_ACCEPT;
 	}
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 86a75105af1a..9d26ff6caabd 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -933,6 +933,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	struct nf_conn *ct;
 
 	if (!cached) {
+		struct nf_hook_state state = {
+			.hook = NF_INET_PRE_ROUTING,
+			.pf = info->family,
+			.net = net,
+		};
 		struct nf_conn *tmpl = info->ct;
 		int err;
 
@@ -944,8 +949,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			nf_ct_set(skb, tmpl, IP_CT_NEW);
 		}
 
-		err = nf_conntrack_in(net, info->family,
-				      NF_INET_PRE_ROUTING, skb);
+		err = nf_conntrack_in(skb, &state);
 		if (err != NF_ACCEPT)
 			return -ENOENT;
 
-- 
cgit v1.2.3


From 9976fc6e6edbb0372f084a2ae8c1b8103b3bff1d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 12 Sep 2018 15:19:08 +0200
Subject: netfilter: conntrack: remove the l4proto->new() function

->new() gets invoked after ->error() and before ->packet() if
a conntrack lookup has found no result for the tuple.

We can fold it into ->packet() -- the packet() implementations
can check if the conntrack is confirmed (new) or not
(already in hash).

If its unconfirmed, the conntrack isn't in the hash yet so current
skb created a new conntrack entry.

Only relevant side effect -- if packet() doesn't return NF_ACCEPT
but -NF_ACCEPT (or drop), while the conntrack was just created,
then the newly allocated conntrack is freed right away, rather than not
created in the first place.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h |   5 -
 net/netfilter/nf_conntrack_core.c            |   6 --
 net/netfilter/nf_conntrack_proto_dccp.c      |  17 ++-
 net/netfilter/nf_conntrack_proto_generic.c   |  20 ++--
 net/netfilter/nf_conntrack_proto_gre.c       |  33 +++---
 net/netfilter/nf_conntrack_proto_icmp.c      |  28 ++---
 net/netfilter/nf_conntrack_proto_icmpv6.c    |  37 +++----
 net/netfilter/nf_conntrack_proto_sctp.c      | 144 ++++++++++++-------------
 net/netfilter/nf_conntrack_proto_tcp.c       | 156 +++++++++++++--------------
 net/netfilter/nf_conntrack_proto_udp.c       |  11 --
 10 files changed, 194 insertions(+), 263 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index a857a0adfb31..016958e67fcc 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -48,11 +48,6 @@ struct nf_conntrack_l4proto {
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state);
 
-	/* Called when a new connection for this protocol found;
-	 * returns TRUE if it's OK.  If so, packet() called next. */
-	bool (*new)(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff);
-
 	/* Called when a conntrack entry is destroyed */
 	void (*destroy)(struct nf_conn *ct);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8e275f4cdcdd..dccc96e94d7c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1370,12 +1370,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 
 	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
 
-	if (!l4proto->new(ct, skb, dataoff)) {
-		nf_conntrack_free(ct);
-		pr_debug("can't track with proto module\n");
-		return NULL;
-	}
-
 	if (timeout_ext)
 		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
 				      GFP_ATOMIC);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 8595c79742a2..e7b5449ea883 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -389,18 +389,15 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.dccp;
 }
 
-static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		     unsigned int dataoff)
+static noinline bool
+dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
+	 const struct dccp_hdr *dh)
 {
 	struct net *net = nf_ct_net(ct);
 	struct nf_dccp_net *dn;
-	struct dccp_hdr _dh, *dh;
 	const char *msg;
 	u_int8_t state;
 
-	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
-	BUG_ON(dh == NULL);
-
 	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
 	switch (state) {
 	default:
@@ -449,8 +446,12 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	unsigned int *timeouts;
 
 	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
-	BUG_ON(dh == NULL);
+	if (!dh)
+		return NF_DROP;
+
 	type = dh->dccph_type;
+	if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh))
+		return -NF_ACCEPT;
 
 	if (type == DCCP_PKT_RESET &&
 	    !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -850,7 +851,6 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.l3proto		= AF_INET,
 	.l4proto		= IPPROTO_DCCP,
-	.new			= dccp_new,
 	.packet			= dccp_packet,
 	.error			= dccp_error,
 	.can_early_drop		= dccp_can_early_drop,
@@ -883,7 +883,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 	.l3proto		= AF_INET6,
 	.l4proto		= IPPROTO_DCCP,
-	.new			= dccp_new,
 	.packet			= dccp_packet,
 	.error			= dccp_error,
 	.can_early_drop		= dccp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 9940161cfef1..deeb05c50f02 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -51,6 +51,12 @@ static int generic_packet(struct nf_conn *ct,
 {
 	const unsigned int *timeout = nf_ct_timeout_lookup(ct);
 
+	if (!nf_generic_should_process(nf_ct_protonum(ct))) {
+		pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
+			     nf_ct_protonum(ct));
+		return -NF_ACCEPT;
+	}
+
 	if (!timeout)
 		timeout = &generic_pernet(nf_ct_net(ct))->timeout;
 
@@ -58,19 +64,6 @@ static int generic_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 }
 
-/* Called when a new connection for this protocol found. */
-static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
-			unsigned int dataoff)
-{
-	bool ret;
-
-	ret = nf_generic_should_process(nf_ct_protonum(ct));
-	if (!ret)
-		pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
-			     nf_ct_protonum(ct));
-	return ret;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
 #include <linux/netfilter/nfnetlink.h>
@@ -164,7 +157,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 	.l4proto		= 255,
 	.pkt_to_tuple		= generic_pkt_to_tuple,
 	.packet			= generic_packet,
-	.new			= generic_new,
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= generic_timeout_nlattr_to_obj,
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 85313a191456..a44bbee271cb 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -238,6 +238,18 @@ static int gre_packet(struct nf_conn *ct,
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state)
 {
+	if (!nf_ct_is_confirmed(ct)) {
+		unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+
+		if (!timeouts)
+			timeouts = gre_get_timeouts(nf_ct_net(ct));
+
+		/* initialize to sane value.  Ideally a conntrack helper
+		 * (e.g. in case of pptp) is increasing them */
+		ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
+		ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
+	}
+
 	/* If we've seen traffic both ways, this is a GRE connection.
 	 * Extend timeout. */
 	if (ct->status & IPS_SEEN_REPLY) {
@@ -253,26 +265,6 @@ static int gre_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 }
 
-/* Called when a new connection for this protocol found. */
-static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff)
-{
-	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
-
-	if (!timeouts)
-		timeouts = gre_get_timeouts(nf_ct_net(ct));
-
-	pr_debug(": ");
-	nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-
-	/* initialize to sane value.  Ideally a conntrack helper
-	 * (e.g. in case of pptp) is increasing them */
-	ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
-	ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
-
-	return true;
-}
-
 /* Called when a conntrack entry has already been removed from the hashes
  * and is about to be deleted from memory */
 static void gre_destroy(struct nf_conn *ct)
@@ -359,7 +351,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
 	.print_conntrack = gre_print_conntrack,
 #endif
 	.packet		 = gre_packet,
-	.new		 = gre_new,
 	.destroy	 = gre_destroy,
 	.me 		 = THIS_MODULE,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index c3a304b53245..19ef0c41602b 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -72,11 +72,6 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
 	return true;
 }
 
-static unsigned int *icmp_get_timeouts(struct net *net)
-{
-	return &icmp_pernet(net)->timeout;
-}
-
 /* Returns verdict for packet, or -1 for invalid. */
 static int icmp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
@@ -88,19 +83,6 @@ static int icmp_packet(struct nf_conn *ct,
 	   successful reply to avoid excessive conntrackd traffic
 	   and also to handle correctly ICMP echo reply duplicates. */
 	unsigned int *timeout = nf_ct_timeout_lookup(ct);
-
-	if (!timeout)
-		timeout = icmp_get_timeouts(nf_ct_net(ct));
-
-	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
-
-	return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		     unsigned int dataoff)
-{
 	static const u_int8_t valid_new[] = {
 		[ICMP_ECHO] = 1,
 		[ICMP_TIMESTAMP] = 1,
@@ -114,9 +96,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		pr_debug("icmp: can't create new conn with type %u\n",
 			 ct->tuplehash[0].tuple.dst.u.icmp.type);
 		nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
-		return false;
+		return -NF_ACCEPT;
 	}
-	return true;
+
+	if (!timeout)
+		timeout = &icmp_pernet(nf_ct_net(ct))->timeout;
+
+	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+	return NF_ACCEPT;
 }
 
 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
@@ -368,7 +355,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 	.pkt_to_tuple		= icmp_pkt_to_tuple,
 	.invert_tuple		= icmp_invert_tuple,
 	.packet			= icmp_packet,
-	.new			= icmp_new,
 	.error			= icmp_error,
 	.destroy		= NULL,
 	.me			= NULL,
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index bb5c98b0af89..bb94363818e6 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -98,6 +98,22 @@ static int icmpv6_packet(struct nf_conn *ct,
 		       const struct nf_hook_state *state)
 {
 	unsigned int *timeout = nf_ct_timeout_lookup(ct);
+	static const u8 valid_new[] = {
+		[ICMPV6_ECHO_REQUEST - 128] = 1,
+		[ICMPV6_NI_QUERY - 128] = 1
+	};
+
+	if (!nf_ct_is_confirmed(ct)) {
+		int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
+
+		if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
+			/* Can't create a new ICMPv6 `conn' with this. */
+			pr_debug("icmpv6: can't create new conn with type %u\n",
+				 type + 128);
+			nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
+			return -NF_ACCEPT;
+		}
+	}
 
 	if (!timeout)
 		timeout = icmpv6_get_timeouts(nf_ct_net(ct));
@@ -110,26 +126,6 @@ static int icmpv6_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 }
 
-/* Called when a new connection for this protocol found. */
-static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
-		       unsigned int dataoff)
-{
-	static const u_int8_t valid_new[] = {
-		[ICMPV6_ECHO_REQUEST - 128] = 1,
-		[ICMPV6_NI_QUERY - 128] = 1
-	};
-	int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
-
-	if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
-		/* Can't create a new ICMPv6 `conn' with this. */
-		pr_debug("icmpv6: can't create new conn with type %u\n",
-			 type + 128);
-		nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
-		return false;
-	}
-	return true;
-}
-
 static int
 icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		     struct sk_buff *skb,
@@ -370,7 +366,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 	.pkt_to_tuple		= icmpv6_pkt_to_tuple,
 	.invert_tuple		= icmpv6_invert_tuple,
 	.packet			= icmpv6_packet,
-	.new			= icmpv6_new,
 	.error			= icmpv6_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= icmpv6_tuple_to_nlattr,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 34b80cea4a56..78c115152a78 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -273,6 +273,63 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
 	return sctp_conntracks[dir][i][cur_state];
 }
 
+/* Don't need lock here: this conntrack not in circulation yet */
+static noinline bool
+sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
+	 const struct sctphdr *sh, unsigned int dataoff)
+{
+	enum sctp_conntrack new_state;
+	const struct sctp_chunkhdr *sch;
+	struct sctp_chunkhdr _sch;
+	u32 offset, count;
+
+	memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
+	new_state = SCTP_CONNTRACK_MAX;
+	for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) {
+		new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
+					   SCTP_CONNTRACK_NONE, sch->type);
+
+		/* Invalid: delete conntrack */
+		if (new_state == SCTP_CONNTRACK_NONE ||
+		    new_state == SCTP_CONNTRACK_MAX) {
+			pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
+			return false;
+		}
+
+		/* Copy the vtag into the state info */
+		if (sch->type == SCTP_CID_INIT) {
+			struct sctp_inithdr _inithdr, *ih;
+			/* Sec 8.5.1 (A) */
+			if (sh->vtag)
+				return false;
+
+			ih = skb_header_pointer(skb, offset + sizeof(_sch),
+						sizeof(_inithdr), &_inithdr);
+			if (!ih)
+				return false;
+
+			pr_debug("Setting vtag %x for new conn\n",
+				 ih->init_tag);
+
+			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
+		} else if (sch->type == SCTP_CID_HEARTBEAT) {
+			pr_debug("Setting vtag %x for secondary conntrack\n",
+				 sh->vtag);
+			ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
+		} else {
+		/* If it is a shutdown ack OOTB packet, we expect a return
+		   shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+			pr_debug("Setting vtag %x for new conn OOTB\n",
+				 sh->vtag);
+			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+		}
+
+		ct->proto.sctp.state = new_state;
+	}
+
+	return true;
+}
+
 /* Returns verdict for packet, or -NF_ACCEPT for invalid. */
 static int sctp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
@@ -297,6 +354,17 @@ static int sctp_packet(struct nf_conn *ct,
 	if (do_basic_checks(ct, skb, dataoff, map) != 0)
 		goto out;
 
+	if (!nf_ct_is_confirmed(ct)) {
+		/* If an OOTB packet has any of these chunks discard (Sec 8.4) */
+		if (test_bit(SCTP_CID_ABORT, map) ||
+		    test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
+		    test_bit(SCTP_CID_COOKIE_ACK, map))
+			return -NF_ACCEPT;
+
+		if (!sctp_new(ct, skb, sh, dataoff))
+			return -NF_ACCEPT;
+	}
+
 	/* Check the verification tag (Sec 8.5) */
 	if (!test_bit(SCTP_CID_INIT, map) &&
 	    !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
@@ -398,80 +466,6 @@ out:
 	return -NF_ACCEPT;
 }
 
-/* Called when a new connection for this protocol found. */
-static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		     unsigned int dataoff)
-{
-	enum sctp_conntrack new_state;
-	const struct sctphdr *sh;
-	struct sctphdr _sctph;
-	const struct sctp_chunkhdr *sch;
-	struct sctp_chunkhdr _sch;
-	u_int32_t offset, count;
-	unsigned long map[256 / sizeof(unsigned long)] = { 0 };
-
-	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
-	if (sh == NULL)
-		return false;
-
-	if (do_basic_checks(ct, skb, dataoff, map) != 0)
-		return false;
-
-	/* If an OOTB packet has any of these chunks discard (Sec 8.4) */
-	if (test_bit(SCTP_CID_ABORT, map) ||
-	    test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
-	    test_bit(SCTP_CID_COOKIE_ACK, map))
-		return false;
-
-	memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
-	new_state = SCTP_CONNTRACK_MAX;
-	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
-		/* Don't need lock here: this conntrack not in circulation yet */
-		new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
-					   SCTP_CONNTRACK_NONE, sch->type);
-
-		/* Invalid: delete conntrack */
-		if (new_state == SCTP_CONNTRACK_NONE ||
-		    new_state == SCTP_CONNTRACK_MAX) {
-			pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
-			return false;
-		}
-
-		/* Copy the vtag into the state info */
-		if (sch->type == SCTP_CID_INIT) {
-			struct sctp_inithdr _inithdr, *ih;
-			/* Sec 8.5.1 (A) */
-			if (sh->vtag)
-				return false;
-
-			ih = skb_header_pointer(skb, offset + sizeof(_sch),
-						sizeof(_inithdr), &_inithdr);
-			if (!ih)
-				return false;
-
-			pr_debug("Setting vtag %x for new conn\n",
-				 ih->init_tag);
-
-			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
-		} else if (sch->type == SCTP_CID_HEARTBEAT) {
-			pr_debug("Setting vtag %x for secondary conntrack\n",
-				 sh->vtag);
-			ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
-		}
-		/* If it is a shutdown ack OOTB packet, we expect a return
-		   shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
-		else {
-			pr_debug("Setting vtag %x for new conn OOTB\n",
-				 sh->vtag);
-			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
-		}
-
-		ct->proto.sctp.state = new_state;
-	}
-
-	return true;
-}
-
 static int sctp_error(struct nf_conn *tpl, struct sk_buff *skb,
 		      unsigned int dataoff,
 		      const struct nf_hook_state *state)
@@ -769,7 +763,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.print_conntrack	= sctp_print_conntrack,
 #endif
 	.packet 		= sctp_packet,
-	.new 			= sctp_new,
 	.error			= sctp_error,
 	.can_early_drop		= sctp_can_early_drop,
 	.me 			= THIS_MODULE,
@@ -803,7 +796,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 	.print_conntrack	= sctp_print_conntrack,
 #endif
 	.packet 		= sctp_packet,
-	.new 			= sctp_new,
 	.error			= sctp_error,
 	.can_early_drop		= sctp_can_early_drop,
 	.me 			= THIS_MODULE,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 5128f0a79ed4..6d278cdff145 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -770,6 +770,78 @@ static int tcp_error(struct nf_conn *tmpl,
 	return NF_ACCEPT;
 }
 
+static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
+			     unsigned int dataoff,
+			     const struct tcphdr *th)
+{
+	enum tcp_conntrack new_state;
+	struct net *net = nf_ct_net(ct);
+	const struct nf_tcp_net *tn = tcp_pernet(net);
+	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
+	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
+
+	/* Don't need lock here: this conntrack not in circulation yet */
+	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
+
+	/* Invalid: delete conntrack */
+	if (new_state >= TCP_CONNTRACK_MAX) {
+		pr_debug("nf_ct_tcp: invalid new deleting.\n");
+		return false;
+	}
+
+	if (new_state == TCP_CONNTRACK_SYN_SENT) {
+		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+		/* SYN packet */
+		ct->proto.tcp.seen[0].td_end =
+			segment_seq_plus_len(ntohl(th->seq), skb->len,
+					     dataoff, th);
+		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+		if (ct->proto.tcp.seen[0].td_maxwin == 0)
+			ct->proto.tcp.seen[0].td_maxwin = 1;
+		ct->proto.tcp.seen[0].td_maxend =
+			ct->proto.tcp.seen[0].td_end;
+
+		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
+	} else if (tn->tcp_loose == 0) {
+		/* Don't try to pick up connections. */
+		return false;
+	} else {
+		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+		/*
+		 * We are in the middle of a connection,
+		 * its history is lost for us.
+		 * Let's try to use the data from the packet.
+		 */
+		ct->proto.tcp.seen[0].td_end =
+			segment_seq_plus_len(ntohl(th->seq), skb->len,
+					     dataoff, th);
+		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+		if (ct->proto.tcp.seen[0].td_maxwin == 0)
+			ct->proto.tcp.seen[0].td_maxwin = 1;
+		ct->proto.tcp.seen[0].td_maxend =
+			ct->proto.tcp.seen[0].td_end +
+			ct->proto.tcp.seen[0].td_maxwin;
+
+		/* We assume SACK and liberal window checking to handle
+		 * window scaling */
+		ct->proto.tcp.seen[0].flags =
+		ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
+					      IP_CT_TCP_FLAG_BE_LIBERAL;
+	}
+
+	/* tcp_packet will set them */
+	ct->proto.tcp.last_index = TCP_NONE_SET;
+
+	pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 __func__,
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
+	return true;
+}
+
 /* Returns verdict for packet, or -1 for invalid. */
 static int tcp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
@@ -788,7 +860,11 @@ static int tcp_packet(struct nf_conn *ct,
 	unsigned long timeout;
 
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
-	BUG_ON(th == NULL);
+	if (th == NULL)
+		return -NF_ACCEPT;
+
+	if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
+		return -NF_ACCEPT;
 
 	spin_lock_bh(&ct->lock);
 	old_state = ct->proto.tcp.state;
@@ -1069,82 +1145,6 @@ static int tcp_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 }
 
-/* Called when a new connection for this protocol found. */
-static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff)
-{
-	enum tcp_conntrack new_state;
-	const struct tcphdr *th;
-	struct tcphdr _tcph;
-	struct net *net = nf_ct_net(ct);
-	struct nf_tcp_net *tn = tcp_pernet(net);
-	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
-	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
-
-	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
-	BUG_ON(th == NULL);
-
-	/* Don't need lock here: this conntrack not in circulation yet */
-	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
-
-	/* Invalid: delete conntrack */
-	if (new_state >= TCP_CONNTRACK_MAX) {
-		pr_debug("nf_ct_tcp: invalid new deleting.\n");
-		return false;
-	}
-
-	if (new_state == TCP_CONNTRACK_SYN_SENT) {
-		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
-		/* SYN packet */
-		ct->proto.tcp.seen[0].td_end =
-			segment_seq_plus_len(ntohl(th->seq), skb->len,
-					     dataoff, th);
-		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
-		if (ct->proto.tcp.seen[0].td_maxwin == 0)
-			ct->proto.tcp.seen[0].td_maxwin = 1;
-		ct->proto.tcp.seen[0].td_maxend =
-			ct->proto.tcp.seen[0].td_end;
-
-		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
-	} else if (tn->tcp_loose == 0) {
-		/* Don't try to pick up connections. */
-		return false;
-	} else {
-		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
-		/*
-		 * We are in the middle of a connection,
-		 * its history is lost for us.
-		 * Let's try to use the data from the packet.
-		 */
-		ct->proto.tcp.seen[0].td_end =
-			segment_seq_plus_len(ntohl(th->seq), skb->len,
-					     dataoff, th);
-		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
-		if (ct->proto.tcp.seen[0].td_maxwin == 0)
-			ct->proto.tcp.seen[0].td_maxwin = 1;
-		ct->proto.tcp.seen[0].td_maxend =
-			ct->proto.tcp.seen[0].td_end +
-			ct->proto.tcp.seen[0].td_maxwin;
-
-		/* We assume SACK and liberal window checking to handle
-		 * window scaling */
-		ct->proto.tcp.seen[0].flags =
-		ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
-					      IP_CT_TCP_FLAG_BE_LIBERAL;
-	}
-
-	/* tcp_packet will set them */
-	ct->proto.tcp.last_index = TCP_NONE_SET;
-
-	pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
-		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-		 sender->td_end, sender->td_maxend, sender->td_maxwin,
-		 sender->td_scale,
-		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-		 receiver->td_scale);
-	return true;
-}
-
 static bool tcp_can_early_drop(const struct nf_conn *ct)
 {
 	switch (ct->proto.tcp.state) {
@@ -1548,7 +1548,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
 	.packet 		= tcp_packet,
-	.new 			= tcp_new,
 	.error			= tcp_error,
 	.can_early_drop		= tcp_can_early_drop,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -1583,7 +1582,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
 	.packet 		= tcp_packet,
-	.new 			= tcp_new,
 	.error			= tcp_error,
 	.can_early_drop		= tcp_can_early_drop,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index bf59d32bba98..1119323425e7 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -70,13 +70,6 @@ static int udp_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 }
 
-/* Called when a new connection for this protocol found. */
-static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff)
-{
-	return true;
-}
-
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
 static void udplite_error_log(const struct sk_buff *skb,
 			      const struct nf_hook_state *state,
@@ -288,7 +281,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.new			= udp_new,
 	.error			= udp_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
@@ -317,7 +309,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.new			= udp_new,
 	.error			= udplite_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
@@ -346,7 +337,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.new			= udp_new,
 	.error			= udp_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
@@ -375,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.new			= udp_new,
 	.error			= udplite_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-- 
cgit v1.2.3


From 83d213fd9d1a56108584cd812333462caa39a747 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 12 Sep 2018 15:19:09 +0200
Subject: netfilter: conntrack: deconstify packet callback skb pointer

Only two protocols need the ->error() function: icmp and icmpv6.
This is because icmp error mssages might be RELATED to an existing
connection (e.g. PMTUD, port unreachable and the like), and their
->error() handlers do this.

The error callback is already optional, so remove it for
udp and call them from ->packet() instead.

As the error() callback can call checksum functions that write to
skb->csum*, the const qualifier has to be removed as well.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h |   2 +-
 net/netfilter/nf_conntrack_proto_dccp.c      |   2 +-
 net/netfilter/nf_conntrack_proto_generic.c   |   2 +-
 net/netfilter/nf_conntrack_proto_gre.c       |   2 +-
 net/netfilter/nf_conntrack_proto_icmp.c      |   2 +-
 net/netfilter/nf_conntrack_proto_icmpv6.c    |   8 +-
 net/netfilter/nf_conntrack_proto_sctp.c      |   2 +-
 net/netfilter/nf_conntrack_proto_tcp.c       |   2 +-
 net/netfilter/nf_conntrack_proto_udp.c       | 137 ++++++++++++++++-----------
 9 files changed, 95 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 016958e67fcc..39f0c84f71b9 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -43,7 +43,7 @@ struct nf_conntrack_l4proto {
 
 	/* Returns verdict for packet, or -1 for invalid. */
 	int (*packet)(struct nf_conn *ct,
-		      const struct sk_buff *skb,
+		      struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index e7b5449ea883..fdea305c7aa5 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -435,7 +435,7 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
 		     ntohl(dhack->dccph_ack_nr_low);
 }
 
-static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
+static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
 		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
 		       const struct nf_hook_state *state)
 {
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index deeb05c50f02..fea952518d0d 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -44,7 +44,7 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
 
 /* Returns verdict for packet, or -1 for invalid. */
 static int generic_packet(struct nf_conn *ct,
-			  const struct sk_buff *skb,
+			  struct sk_buff *skb,
 			  unsigned int dataoff,
 			  enum ip_conntrack_info ctinfo,
 			  const struct nf_hook_state *state)
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a44bbee271cb..0348aa98950a 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -233,7 +233,7 @@ static unsigned int *gre_get_timeouts(struct net *net)
 
 /* Returns verdict for packet, and may modify conntrack */
 static int gre_packet(struct nf_conn *ct,
-		      const struct sk_buff *skb,
+		      struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state)
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 19ef0c41602b..a2ca3a739aa3 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -74,7 +74,7 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
 
 /* Returns verdict for packet, or -1 for invalid. */
 static int icmp_packet(struct nf_conn *ct,
-		       const struct sk_buff *skb,
+		       struct sk_buff *skb,
 		       unsigned int dataoff,
 		       enum ip_conntrack_info ctinfo,
 		       const struct nf_hook_state *state)
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index bb94363818e6..a1933566d53d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -92,10 +92,10 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
 
 /* Returns verdict for packet, or -1 for invalid. */
 static int icmpv6_packet(struct nf_conn *ct,
-		       const struct sk_buff *skb,
-		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo,
-		       const struct nf_hook_state *state)
+		         struct sk_buff *skb,
+		         unsigned int dataoff,
+		         enum ip_conntrack_info ctinfo,
+		         const struct nf_hook_state *state)
 {
 	unsigned int *timeout = nf_ct_timeout_lookup(ct);
 	static const u8 valid_new[] = {
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 78c115152a78..ea16c1c58483 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -332,7 +332,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 /* Returns verdict for packet, or -NF_ACCEPT for invalid. */
 static int sctp_packet(struct nf_conn *ct,
-		       const struct sk_buff *skb,
+		       struct sk_buff *skb,
 		       unsigned int dataoff,
 		       enum ip_conntrack_info ctinfo,
 		       const struct nf_hook_state *state)
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 6d278cdff145..0c3e1f2f9013 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -844,7 +844,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 
 /* Returns verdict for packet, or -1 for invalid. */
 static int tcp_packet(struct nf_conn *ct,
-		      const struct sk_buff *skb,
+		      struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state)
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 1119323425e7..da94c967c835 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -42,15 +42,65 @@ static unsigned int *udp_get_timeouts(struct net *net)
 	return udp_pernet(net)->timeouts;
 }
 
+static void udp_error_log(const struct sk_buff *skb,
+			  const struct nf_hook_state *state,
+			  const char *msg)
+{
+	nf_l4proto_log_invalid(skb, state->net, state->pf,
+			       IPPROTO_UDP, "%s", msg);
+}
+
+static bool udp_error(struct sk_buff *skb,
+		      unsigned int dataoff,
+		      const struct nf_hook_state *state)
+{
+	unsigned int udplen = skb->len - dataoff;
+	const struct udphdr *hdr;
+	struct udphdr _hdr;
+
+	/* Header is too small? */
+	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (!hdr) {
+		udp_error_log(skb, state, "short packet");
+		return true;
+	}
+
+	/* Truncated/malformed packets */
+	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+		udp_error_log(skb, state, "truncated/malformed packet");
+		return true;
+	}
+
+	/* Packet with no checksum */
+	if (!hdr->check)
+		return false;
+
+	/* Checksum invalid? Ignore.
+	 * We skip checking packets on the outgoing path
+	 * because the checksum is assumed to be correct.
+	 * FIXME: Source route IP option packets --RR */
+	if (state->hook == NF_INET_PRE_ROUTING &&
+	    state->net->ct.sysctl_checksum &&
+	    nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) {
+		udp_error_log(skb, state, "bad checksum");
+		return true;
+	}
+
+	return false;
+}
+
 /* Returns verdict for packet, and may modify conntracktype */
 static int udp_packet(struct nf_conn *ct,
-		      const struct sk_buff *skb,
+		      struct sk_buff *skb,
 		      unsigned int dataoff,
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state)
 {
 	unsigned int *timeouts;
 
+	if (udp_error(skb, dataoff, state))
+		return -NF_ACCEPT;
+
 	timeouts = nf_ct_timeout_lookup(ct);
 	if (!timeouts)
 		timeouts = udp_get_timeouts(nf_ct_net(ct));
@@ -79,9 +129,9 @@ static void udplite_error_log(const struct sk_buff *skb,
 			       IPPROTO_UDPLITE, "%s", msg);
 }
 
-static int udplite_error(struct nf_conn *tmpl, struct sk_buff *skb,
-			 unsigned int dataoff,
-			 const struct nf_hook_state *state)
+static bool udplite_error(struct sk_buff *skb,
+			  unsigned int dataoff,
+			  const struct nf_hook_state *state)
 {
 	unsigned int udplen = skb->len - dataoff;
 	const struct udphdr *hdr;
@@ -92,7 +142,7 @@ static int udplite_error(struct nf_conn *tmpl, struct sk_buff *skb,
 	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
 	if (!hdr) {
 		udplite_error_log(skb, state, "short packet");
-		return -NF_ACCEPT;
+		return true;
 	}
 
 	cscov = ntohs(hdr->len);
@@ -100,13 +150,13 @@ static int udplite_error(struct nf_conn *tmpl, struct sk_buff *skb,
 		cscov = udplen;
 	} else if (cscov < sizeof(*hdr) || cscov > udplen) {
 		udplite_error_log(skb, state, "invalid checksum coverage");
-		return -NF_ACCEPT;
+		return true;
 	}
 
 	/* UDPLITE mandates checksums */
 	if (!hdr->check) {
 		udplite_error_log(skb, state, "checksum missing");
-		return -NF_ACCEPT;
+		return true;
 	}
 
 	/* Checksum invalid? Ignore. */
@@ -115,58 +165,43 @@ static int udplite_error(struct nf_conn *tmpl, struct sk_buff *skb,
 	    nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP,
 				state->pf)) {
 		udplite_error_log(skb, state, "bad checksum");
-		return -NF_ACCEPT;
+		return true;
 	}
 
-	return NF_ACCEPT;
+	return false;
 }
-#endif
 
-static void udp_error_log(const struct sk_buff *skb,
-			  const struct nf_hook_state *state,
-			  const char *msg)
-{
-	nf_l4proto_log_invalid(skb, state->net, state->pf,
-			       IPPROTO_UDP, "%s", msg);
-}
-
-static int udp_error(struct nf_conn *tmpl, struct sk_buff *skb,
-		     unsigned int dataoff,
-		     const struct nf_hook_state *state)
+/* Returns verdict for packet, and may modify conntracktype */
+static int udplite_packet(struct nf_conn *ct,
+			  struct sk_buff *skb,
+			  unsigned int dataoff,
+			  enum ip_conntrack_info ctinfo,
+			  const struct nf_hook_state *state)
 {
-	unsigned int udplen = skb->len - dataoff;
-	const struct udphdr *hdr;
-	struct udphdr _hdr;
-
-	/* Header is too small? */
-	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
-	if (hdr == NULL) {
-		udp_error_log(skb, state, "short packet");
-		return -NF_ACCEPT;
-	}
+	unsigned int *timeouts;
 
-	/* Truncated/malformed packets */
-	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
-		udp_error_log(skb, state, "truncated/malformed packet");
+	if (udplite_error(skb, dataoff, state))
 		return -NF_ACCEPT;
-	}
 
-	/* Packet with no checksum */
-	if (!hdr->check)
-		return NF_ACCEPT;
+	timeouts = nf_ct_timeout_lookup(ct);
+	if (!timeouts)
+		timeouts = udp_get_timeouts(nf_ct_net(ct));
 
-	/* Checksum invalid? Ignore.
-	 * We skip checking packets on the outgoing path
-	 * because the checksum is assumed to be correct.
-	 * FIXME: Source route IP option packets --RR */
-	if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING &&
-	    nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) {
-		udp_error_log(skb, state, "bad checksum");
-		return -NF_ACCEPT;
+	/* If we've seen traffic both ways, this is some kind of UDP
+	   stream.  Extend timeout. */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb,
+				   timeouts[UDP_CT_REPLIED]);
+		/* Also, more likely to be important, and not a probe */
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+			nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	} else {
+		nf_ct_refresh_acct(ct, ctinfo, skb,
+				   timeouts[UDP_CT_UNREPLIED]);
 	}
-
 	return NF_ACCEPT;
 }
+#endif
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 
@@ -281,7 +316,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.error			= udp_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
@@ -308,8 +342,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
-	.packet			= udp_packet,
-	.error			= udplite_error,
+	.packet			= udplite_packet,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
@@ -337,7 +370,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.error			= udp_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
@@ -364,8 +396,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
-	.packet			= udp_packet,
-	.error			= udplite_error,
+	.packet			= udplite_packet,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-- 
cgit v1.2.3


From 6fe78fa484a5dad030b24e33e0cedc5d5bbd0fde Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 12 Sep 2018 15:19:11 +0200
Subject: netfilter: conntrack: remove error callback and handle icmp from core

icmp(v6) are the only two layer four protocols that need the error()
callback (to handle icmp errors that are related to an established
connections, e.g. packet too big, port unreachable and the like).

Remove the error callback and handle these two special cases from the core.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h | 13 ++++++---
 net/netfilter/nf_conntrack_core.c            | 43 +++++++++++++++++++++++-----
 net/netfilter/nf_conntrack_proto_icmp.c      |  8 ++----
 net/netfilter/nf_conntrack_proto_icmpv6.c    | 10 +++----
 4 files changed, 52 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 39f0c84f71b9..7fdb4b95bba4 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -51,10 +51,6 @@ struct nf_conntrack_l4proto {
 	/* Called when a conntrack entry is destroyed */
 	void (*destroy)(struct nf_conn *ct);
 
-	int (*error)(struct nf_conn *tmpl, struct sk_buff *skb,
-		     unsigned int dataoff,
-		     const struct nf_hook_state *state);
-
 	/* called by gc worker if table is full */
 	bool (*can_early_drop)(const struct nf_conn *ct);
 
@@ -97,6 +93,15 @@ struct nf_conntrack_l4proto {
 	struct module *me;
 };
 
+int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
+			      struct sk_buff *skb,
+			      unsigned int dataoff,
+			      const struct nf_hook_state *state);
+
+int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
+			      struct sk_buff *skb,
+			      unsigned int dataoff,
+			      const struct nf_hook_state *state);
 /* Existing built-in generic protocol */
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index dccc96e94d7c..087bf63826fb 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1486,6 +1486,39 @@ resolve_normal_ct(struct nf_conn *tmpl,
 	return 0;
 }
 
+/*
+ * icmp packets need special treatment to handle error messages that are
+ * related to a connection.
+ *
+ * Callers need to check if skb has a conntrack assigned when this
+ * helper returns; in such case skb belongs to an already known connection.
+ */
+static unsigned int __cold
+nf_conntrack_handle_icmp(struct nf_conn *tmpl,
+			 struct sk_buff *skb,
+			 unsigned int dataoff,
+			 u8 protonum,
+			 const struct nf_hook_state *state)
+{
+	int ret;
+
+	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
+		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
+		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
+#endif
+	else
+		return NF_ACCEPT;
+
+	if (ret <= 0) {
+		NF_CT_STAT_INC_ATOMIC(state->net, error);
+		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+	}
+
+	return ret;
+}
+
 unsigned int
 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
 {
@@ -1518,14 +1551,10 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
 
 	l4proto = __nf_ct_l4proto_find(state->pf, protonum);
 
-	/* It may be an special packet, error, unclean...
-	 * inverse of the return code tells to the netfilter
-	 * core what to do with the packet. */
-	if (l4proto->error != NULL) {
-		ret = l4proto->error(tmpl, skb, dataoff, state);
+	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
+		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
+					       protonum, state);
 		if (ret <= 0) {
-			NF_CT_STAT_INC_ATOMIC(state->net, error);
-			NF_CT_STAT_INC_ATOMIC(state->net, invalid);
 			ret = -ret;
 			goto out;
 		}
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index a2ca3a739aa3..2c981622b674 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -165,10 +165,9 @@ static void icmp_error_log(const struct sk_buff *skb,
 }
 
 /* Small and modified version of icmp_rcv */
-static int
-icmp_error(struct nf_conn *tmpl,
-	   struct sk_buff *skb, unsigned int dataoff,
-	   const struct nf_hook_state *state)
+int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
+			      struct sk_buff *skb, unsigned int dataoff,
+			      const struct nf_hook_state *state)
 {
 	const struct icmphdr *icmph;
 	struct icmphdr _ih;
@@ -355,7 +354,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 	.pkt_to_tuple		= icmp_pkt_to_tuple,
 	.invert_tuple		= icmp_invert_tuple,
 	.packet			= icmp_packet,
-	.error			= icmp_error,
 	.destroy		= NULL,
 	.me			= NULL,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index a1933566d53d..effac451c7e0 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -184,11 +184,10 @@ static void icmpv6_error_log(const struct sk_buff *skb,
 			       IPPROTO_ICMPV6, "%s", msg);
 }
 
-static int
-icmpv6_error(struct nf_conn *tmpl,
-	     struct sk_buff *skb,
-	     unsigned int dataoff,
-	     const struct nf_hook_state *state)
+int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
+			      struct sk_buff *skb,
+			      unsigned int dataoff,
+			      const struct nf_hook_state *state)
 {
 	const struct icmp6hdr *icmp6h;
 	struct icmp6hdr _ih;
@@ -366,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 	.pkt_to_tuple		= icmpv6_pkt_to_tuple,
 	.invert_tuple		= icmpv6_invert_tuple,
 	.packet			= icmpv6_packet,
-	.error			= icmpv6_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= icmpv6_tuple_to_nlattr,
 	.nlattr_tuple_size	= icmpv6_nlattr_tuple_size,
-- 
cgit v1.2.3


From ca2ca6e1c04e64413f5fb9a5d54fb8b0bdd86467 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 12 Sep 2018 15:19:12 +0200
Subject: netfilter: conntrack: remove unused proto arg from netns init
 functions

Its unused, next patch will remove l4proto->l3proto number to simplify
l4 protocol demuxer lookup.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h | 2 +-
 net/netfilter/nf_conntrack_proto.c           | 5 ++---
 net/netfilter/nf_conntrack_proto_dccp.c      | 2 +-
 net/netfilter/nf_conntrack_proto_generic.c   | 2 +-
 net/netfilter/nf_conntrack_proto_gre.c       | 2 +-
 net/netfilter/nf_conntrack_proto_icmp.c      | 2 +-
 net/netfilter/nf_conntrack_proto_icmpv6.c    | 2 +-
 net/netfilter/nf_conntrack_proto_sctp.c      | 2 +-
 net/netfilter/nf_conntrack_proto_tcp.c       | 2 +-
 net/netfilter/nf_conntrack_proto_udp.c       | 2 +-
 10 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 7fdb4b95bba4..420823a8648f 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -84,7 +84,7 @@ struct nf_conntrack_l4proto {
 #endif
 	unsigned int	*net_id;
 	/* Init l4proto pernet data */
-	int (*init_net)(struct net *net, u_int16_t proto);
+	int (*init_net)(struct net *net);
 
 	/* Return the per-net protocol part. */
 	struct nf_proto_net *(*get_net_proto)(struct net *net);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 4896ba44becb..06a182a23d92 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -274,7 +274,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net,
 	struct nf_proto_net *pn = NULL;
 
 	if (l4proto->init_net) {
-		ret = l4proto->init_net(net, l4proto->l3proto);
+		ret = l4proto->init_net(net);
 		if (ret < 0)
 			goto out;
 	}
@@ -988,8 +988,7 @@ int nf_conntrack_proto_pernet_init(struct net *net)
 	struct nf_proto_net *pn = nf_ct_l4proto_net(net,
 					&nf_conntrack_l4proto_generic);
 
-	err = nf_conntrack_l4proto_generic.init_net(net,
-					nf_conntrack_l4proto_generic.l3proto);
+	err = nf_conntrack_l4proto_generic.init_net(net);
 	if (err < 0)
 		return err;
 	err = nf_ct_l4proto_register_sysctl(net,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 1b9e600f707d..d22852ae2316 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -812,7 +812,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
 	return 0;
 }
 
-static int dccp_init_net(struct net *net, u_int16_t proto)
+static int dccp_init_net(struct net *net)
 {
 	struct nf_dccp_net *dn = dccp_pernet(net);
 	struct nf_proto_net *pn = &dn->pn;
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index fea952518d0d..4530f76b51bd 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -136,7 +136,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
 	return 0;
 }
 
-static int generic_init_net(struct net *net, u_int16_t proto)
+static int generic_init_net(struct net *net)
 {
 	struct nf_generic_net *gn = generic_pernet(net);
 	struct nf_proto_net *pn = &gn->pn;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 0348aa98950a..810039dcd779 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -329,7 +329,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
 };
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
-static int gre_init_net(struct net *net, u_int16_t proto)
+static int gre_init_net(struct net *net)
 {
 	struct netns_proto_gre *net_gre = gre_pernet(net);
 	int i;
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 2c981622b674..a9d2769f72d8 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -332,7 +332,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 	return 0;
 }
 
-static int icmp_init_net(struct net *net, u_int16_t proto)
+static int icmp_init_net(struct net *net)
 {
 	struct nf_icmp_net *in = icmp_pernet(net);
 	struct nf_proto_net *pn = &in->pn;
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index effac451c7e0..8a88db599d66 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -343,7 +343,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn,
 	return 0;
 }
 
-static int icmpv6_init_net(struct net *net, u_int16_t proto)
+static int icmpv6_init_net(struct net *net)
 {
 	struct nf_icmp_net *in = icmpv6_pernet(net);
 	struct nf_proto_net *pn = &in->pn;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 7d7eb18f658b..9cf59ab280c0 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -734,7 +734,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 	return 0;
 }
 
-static int sctp_init_net(struct net *net, u_int16_t proto)
+static int sctp_init_net(struct net *net)
 {
 	struct nf_sctp_net *sn = sctp_pernet(net);
 	struct nf_proto_net *pn = &sn->pn;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 14a1a9348fcc..f4954fa7be25 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1506,7 +1506,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 	return 0;
 }
 
-static int tcp_init_net(struct net *net, u_int16_t proto)
+static int tcp_init_net(struct net *net)
 {
 	struct nf_tcp_net *tn = tcp_pernet(net);
 	struct nf_proto_net *pn = &tn->pn;
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index da94c967c835..4645bf5b20c8 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -290,7 +290,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
 	return 0;
 }
 
-static int udp_init_net(struct net *net, u_int16_t proto)
+static int udp_init_net(struct net *net)
 {
 	struct nf_udp_net *un = udp_pernet(net);
 	struct nf_proto_net *pn = &un->pn;
-- 
cgit v1.2.3


From dd2934a95701576203b2f61e8ded4e4a2f9183ea Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 17 Sep 2018 12:02:54 +0200
Subject: netfilter: conntrack: remove l3->l4 mapping information

l4 protocols are demuxed by l3num, l4num pair.

However, almost all l4 trackers are l3 agnostic.

Only exceptions are:
 - gre, icmp (ipv4 only)
 - icmpv6 (ipv6 only)

This commit gets rid of the l3 mapping, l4 trackers can now be looked up
by their IPPROTO_XXX value alone, which gets rid of the additional l3
indirection.

For icmp, ipcmp6 and gre, add a check on state->pf and
return -NF_ACCEPT in case we're asked to track e.g. icmpv6-in-ipv4,
this seems more fitting than using the generic tracker.

Additionally we can kill the 2nd l4proto definitions that were needed
for v4/v6 split -- they are now the same so we can use single l4proto
struct for each protocol, rather than two.

The EXPORT_SYMBOLs can be removed as all these object files are
part of nf_conntrack with no external references.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  13 ++--
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  13 ----
 include/net/netfilter/nf_conntrack_l4proto.h   |   9 +--
 net/netfilter/nf_conntrack_core.c              |  15 ++--
 net/netfilter/nf_conntrack_expect.c            |   3 +-
 net/netfilter/nf_conntrack_netlink.c           |  14 ++--
 net/netfilter/nf_conntrack_proto.c             | 104 +++++++------------------
 net/netfilter/nf_conntrack_proto_dccp.c        |  35 +--------
 net/netfilter/nf_conntrack_proto_generic.c     |   1 -
 net/netfilter/nf_conntrack_proto_gre.c         |   4 +-
 net/netfilter/nf_conntrack_proto_icmp.c        |   6 +-
 net/netfilter/nf_conntrack_proto_icmpv6.c      |   6 +-
 net/netfilter/nf_conntrack_proto_sctp.c        |  36 +--------
 net/netfilter/nf_conntrack_proto_tcp.c         |  37 +--------
 net/netfilter/nf_conntrack_proto_udp.c         |  62 +--------------
 net/netfilter/nf_conntrack_standalone.c        |   2 +-
 net/netfilter/nf_flow_table_core.c             |   2 +-
 net/netfilter/nfnetlink_cttimeout.c            |  11 +--
 net/netfilter/nft_ct.c                         |   2 +-
 net/netfilter/xt_CT.c                          |   2 +-
 20 files changed, 76 insertions(+), 301 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index c84b51682f08..135ee702c7b0 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -10,20 +10,17 @@
 #ifndef _NF_CONNTRACK_IPV4_H
 #define _NF_CONNTRACK_IPV4_H
 
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp;
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp;
 #endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp;
 #endif
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4;
+extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite;
 #endif
 
-int nf_conntrack_ipv4_compat_init(void);
-void nf_conntrack_ipv4_compat_fini(void);
-
 #endif /*_NF_CONNTRACK_IPV4_H*/
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index effa8dfba68c..7b3c873f8839 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -2,20 +2,7 @@
 #ifndef _NF_CONNTRACK_IPV6_H
 #define _NF_CONNTRACK_IPV6_H
 
-extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
-
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6;
-#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 420823a8648f..d838a93430a1 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -18,9 +18,6 @@
 struct seq_file;
 
 struct nf_conntrack_l4proto {
-	/* L3 Protocol number. */
-	u_int16_t l3proto;
-
 	/* L4 Protocol number. */
 	u_int8_t l4proto;
 
@@ -107,11 +104,9 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
 
 #define MAX_NF_CT_PROTO 256
 
-const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u_int16_t l3proto,
-						  u_int8_t l4proto);
+const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto);
 
-const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto,
-						    u_int8_t l4proto);
+const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4proto);
 void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p);
 
 /* Protocol pernet registration. */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 087bf63826fb..ca1168d67fac 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -379,7 +379,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 		return false;
 	}
 
-	l4proto = __nf_ct_l4proto_find(l3num, protonum);
+	l4proto = __nf_ct_l4proto_find(protonum);
 
 	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
 			      l4proto);
@@ -539,7 +539,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 		nf_ct_tmpl_free(ct);
 		return;
 	}
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	if (l4proto->destroy)
 		l4proto->destroy(ct);
 
@@ -840,7 +840,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 	enum ip_conntrack_info oldinfo;
 	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
 
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	if (l4proto->allow_clash &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
@@ -1109,7 +1109,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
 		return true;
 
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
 		return true;
 
@@ -1549,7 +1549,7 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
 		goto out;
 	}
 
-	l4proto = __nf_ct_l4proto_find(state->pf, protonum);
+	l4proto = __nf_ct_l4proto_find(protonum);
 
 	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
 		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
@@ -1618,8 +1618,7 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
 
 	rcu_read_lock();
 	ret = nf_ct_invert_tuple(inverse, orig,
-				 __nf_ct_l4proto_find(orig->src.l3num,
-						      orig->dst.protonum));
+				 __nf_ct_l4proto_find(orig->dst.protonum));
 	rcu_read_unlock();
 	return ret;
 }
@@ -1776,7 +1775,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
 	if (dataoff <= 0)
 		return -1;
 
-	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+	l4proto = nf_ct_l4proto_find_get(l4num);
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
 			     l4num, net, &tuple, l4proto))
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 27b84231db10..3034038bfdf0 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -610,8 +610,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
 		   expect->tuple.src.l3num,
 		   expect->tuple.dst.protonum);
 	print_tuple(s, &expect->tuple,
-		    __nf_ct_l4proto_find(expect->tuple.src.l3num,
-				       expect->tuple.dst.protonum));
+		    __nf_ct_l4proto_find(expect->tuple.dst.protonum));
 
 	if (expect->flags & NF_CT_EXPECT_PERMANENT) {
 		seq_puts(s, "PERMANENT");
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index f8c74f31aa36..099a450e26be 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -135,8 +135,7 @@ static int ctnetlink_dump_tuples(struct sk_buff *skb,
 	ret = ctnetlink_dump_tuples_ip(skb, tuple);
 
 	if (ret >= 0) {
-		l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
-					       tuple->dst.protonum);
+		l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
 		ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
 	}
 	rcu_read_unlock();
@@ -184,7 +183,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
 	struct nlattr *nest_proto;
 	int ret;
 
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	if (!l4proto->to_nlattr)
 		return 0;
 
@@ -592,7 +591,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
 	len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
 	len *= 3u; /* ORIG, REPLY, MASTER */
 
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	len += l4proto->nlattr_size;
 	if (l4proto->nlattr_tuple_size) {
 		len4 = l4proto->nlattr_tuple_size();
@@ -1054,7 +1053,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
 	tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
 
 	rcu_read_lock();
-	l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+	l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
 
 	if (likely(l4proto->nlattr_to_tuple)) {
 		ret = nla_validate_nested(attr, CTA_PROTO_MAX,
@@ -1702,7 +1701,7 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct,
 		return err;
 
 	rcu_read_lock();
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	if (l4proto->from_nlattr)
 		err = l4proto->from_nlattr(tb, ct);
 	rcu_read_unlock();
@@ -2662,8 +2661,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
 	rcu_read_lock();
 	ret = ctnetlink_dump_tuples_ip(skb, &m);
 	if (ret >= 0) {
-		l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
-					       tuple->dst.protonum);
+		l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
 	ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
 	}
 	rcu_read_unlock();
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 06a182a23d92..69d7170cfa8c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -43,7 +43,7 @@
 
 extern unsigned int nf_conntrack_net_id;
 
-static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly;
+static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO] __read_mostly;
 
 static DEFINE_MUTEX(nf_ct_proto_mutex);
 
@@ -124,23 +124,21 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid);
 #endif
 
-const struct nf_conntrack_l4proto *
-__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
+const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto)
 {
-	if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL))
+	if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos)))
 		return &nf_conntrack_l4proto_generic;
 
-	return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
+	return rcu_dereference(nf_ct_protos[l4proto]);
 }
 EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
 
-const struct nf_conntrack_l4proto *
-nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
+const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num)
 {
 	const struct nf_conntrack_l4proto *p;
 
 	rcu_read_lock();
-	p = __nf_ct_l4proto_find(l3num, l4num);
+	p = __nf_ct_l4proto_find(l4num);
 	if (!try_module_get(p->me))
 		p = &nf_conntrack_l4proto_generic;
 	rcu_read_unlock();
@@ -159,8 +157,7 @@ static int kill_l4proto(struct nf_conn *i, void *data)
 {
 	const struct nf_conntrack_l4proto *l4proto;
 	l4proto = data;
-	return nf_ct_protonum(i) == l4proto->l4proto &&
-	       nf_ct_l3num(i) == l4proto->l3proto;
+	return nf_ct_protonum(i) == l4proto->l4proto;
 }
 
 static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
@@ -219,48 +216,20 @@ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto)
 {
 	int ret = 0;
 
-	if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos))
-		return -EBUSY;
-
 	if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) ||
 	    (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
 		return -EINVAL;
 
 	mutex_lock(&nf_ct_proto_mutex);
-	if (!nf_ct_protos[l4proto->l3proto]) {
-		/* l3proto may be loaded latter. */
-		struct nf_conntrack_l4proto __rcu **proto_array;
-		int i;
-
-		proto_array =
-			kmalloc_array(MAX_NF_CT_PROTO,
-				      sizeof(struct nf_conntrack_l4proto *),
-				      GFP_KERNEL);
-		if (proto_array == NULL) {
-			ret = -ENOMEM;
-			goto out_unlock;
-		}
-
-		for (i = 0; i < MAX_NF_CT_PROTO; i++)
-			RCU_INIT_POINTER(proto_array[i],
-					 &nf_conntrack_l4proto_generic);
-
-		/* Before making proto_array visible to lockless readers,
-		 * we must make sure its content is committed to memory.
-		 */
-		smp_wmb();
-
-		nf_ct_protos[l4proto->l3proto] = proto_array;
-	} else if (rcu_dereference_protected(
-			nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+	if (rcu_dereference_protected(
+			nf_ct_protos[l4proto->l4proto],
 			lockdep_is_held(&nf_ct_proto_mutex)
 			) != &nf_conntrack_l4proto_generic) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
 
-	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
-			   l4proto);
+	rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto);
 out_unlock:
 	mutex_unlock(&nf_ct_proto_mutex);
 	return ret;
@@ -296,13 +265,13 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
 static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
 
 {
-	BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos));
+	BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos));
 
 	BUG_ON(rcu_dereference_protected(
-			nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			nf_ct_protos[l4proto->l4proto],
 			lockdep_is_held(&nf_ct_proto_mutex)
 			) != l4proto);
-	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+	rcu_assign_pointer(nf_ct_protos[l4proto->l4proto],
 			   &nf_conntrack_l4proto_generic);
 }
 
@@ -352,7 +321,7 @@ static int
 nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
 		       unsigned int num_proto)
 {
-	int ret = -EINVAL, ver;
+	int ret = -EINVAL;
 	unsigned int i;
 
 	for (i = 0; i < num_proto; i++) {
@@ -361,9 +330,8 @@ nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
 			break;
 	}
 	if (i != num_proto) {
-		ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4;
-		pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n",
-		       ver, l4proto[i]->l4proto);
+		pr_err("nf_conntrack: can't register l4 %d proto.\n",
+		       l4proto[i]->l4proto);
 		nf_ct_l4proto_unregister(l4proto, i);
 	}
 	return ret;
@@ -382,9 +350,8 @@ int nf_ct_l4proto_pernet_register(struct net *net,
 			break;
 	}
 	if (i != num_proto) {
-		pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n",
-		       l4proto[i]->l4proto,
-		       l4proto[i]->l3proto == PF_INET6 ? 6 : 4);
+		pr_err("nf_conntrack %d: pernet registration failed\n",
+		       l4proto[i]->l4proto);
 		nf_ct_l4proto_pernet_unregister(net, l4proto, i);
 	}
 	return ret;
@@ -911,37 +878,26 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto)
 EXPORT_SYMBOL_GPL(nf_ct_netns_put);
 
 static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
-	&nf_conntrack_l4proto_tcp4,
-	&nf_conntrack_l4proto_udp4,
+	&nf_conntrack_l4proto_tcp,
+	&nf_conntrack_l4proto_udp,
 	&nf_conntrack_l4proto_icmp,
 #ifdef CONFIG_NF_CT_PROTO_DCCP
-	&nf_conntrack_l4proto_dccp4,
+	&nf_conntrack_l4proto_dccp,
 #endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-	&nf_conntrack_l4proto_sctp4,
+	&nf_conntrack_l4proto_sctp,
 #endif
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-	&nf_conntrack_l4proto_udplite4,
+	&nf_conntrack_l4proto_udplite,
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
-	&nf_conntrack_l4proto_tcp6,
-	&nf_conntrack_l4proto_udp6,
 	&nf_conntrack_l4proto_icmpv6,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	&nf_conntrack_l4proto_dccp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
-	&nf_conntrack_l4proto_sctp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-	&nf_conntrack_l4proto_udplite6,
-#endif
 #endif /* CONFIG_IPV6 */
 };
 
 int nf_conntrack_proto_init(void)
 {
-	int ret = 0;
+	int ret = 0, i;
 
 	ret = nf_register_sockopt(&so_getorigdst);
 	if (ret < 0)
@@ -952,6 +908,11 @@ int nf_conntrack_proto_init(void)
 	if (ret < 0)
 		goto cleanup_sockopt;
 #endif
+
+	for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
+		RCU_INIT_POINTER(nf_ct_protos[i],
+				 &nf_conntrack_l4proto_generic);
+
 	ret = nf_ct_l4proto_register(builtin_l4proto,
 				     ARRAY_SIZE(builtin_l4proto));
 	if (ret < 0)
@@ -969,17 +930,10 @@ cleanup_sockopt:
 
 void nf_conntrack_proto_fini(void)
 {
-	unsigned int i;
-
 	nf_unregister_sockopt(&so_getorigdst);
 #if IS_ENABLED(CONFIG_IPV6)
 	nf_unregister_sockopt(&so_getorigdst6);
 #endif
-	/* No need to call nf_ct_l4proto_unregister(), the register
-	 * tables are free'd here anyway.
-	 */
-	for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
-		kfree(nf_ct_protos[i]);
 }
 
 int nf_conntrack_proto_pernet_init(struct net *net)
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index d22852ae2316..171e9e122e5f 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -842,8 +842,7 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.dccp.pn;
 }
 
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
-	.l3proto		= AF_INET,
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
 	.l4proto		= IPPROTO_DCCP,
 	.packet			= dccp_packet,
 	.can_early_drop		= dccp_can_early_drop,
@@ -871,35 +870,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.init_net		= dccp_init_net,
 	.get_net_proto		= dccp_get_net_proto,
 };
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
-	.l3proto		= AF_INET6,
-	.l4proto		= IPPROTO_DCCP,
-	.packet			= dccp_packet,
-	.can_early_drop		= dccp_can_early_drop,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-	.print_conntrack	= dccp_print_conntrack,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.nlattr_size		= DCCP_NLATTR_SIZE,
-	.to_nlattr		= dccp_to_nlattr,
-	.from_nlattr		= nlattr_to_dccp,
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= dccp_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= dccp_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_DCCP_MAX,
-		.obj_size	= sizeof(unsigned int) * CT_DCCP_MAX,
-		.nla_policy	= dccp_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-	.init_net		= dccp_init_net,
-	.get_net_proto		= dccp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 4530f76b51bd..e10e867e0b55 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -153,7 +153,6 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net)
 
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 {
-	.l3proto		= PF_UNSPEC,
 	.l4proto		= 255,
 	.pkt_to_tuple		= generic_pkt_to_tuple,
 	.packet			= generic_packet,
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 810039dcd779..9b48dc8b4b88 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -238,6 +238,9 @@ static int gre_packet(struct nf_conn *ct,
 		      enum ip_conntrack_info ctinfo,
 		      const struct nf_hook_state *state)
 {
+	if (state->pf != NFPROTO_IPV4)
+		return -NF_ACCEPT;
+
 	if (!nf_ct_is_confirmed(ct)) {
 		unsigned int *timeouts = nf_ct_timeout_lookup(ct);
 
@@ -344,7 +347,6 @@ static int gre_init_net(struct net *net)
 
 /* protocol helper struct */
 static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
-	.l3proto	 = AF_INET,
 	.l4proto	 = IPPROTO_GRE,
 	.pkt_to_tuple	 = gre_pkt_to_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index a9d2769f72d8..3598520bd19b 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -90,6 +90,9 @@ static int icmp_packet(struct nf_conn *ct,
 		[ICMP_ADDRESS] = 1
 	};
 
+	if (state->pf != NFPROTO_IPV4)
+		return -NF_ACCEPT;
+
 	if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
 	    !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
 		/* Can't create a new ICMP `conn' with this. */
@@ -131,7 +134,7 @@ icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb,
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
-	innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
+	innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
 	   been preserved inside the ICMP. */
@@ -349,7 +352,6 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net)
 
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 {
-	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_ICMP,
 	.pkt_to_tuple		= icmp_pkt_to_tuple,
 	.invert_tuple		= icmp_invert_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 8a88db599d66..378618feed5d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -103,6 +103,9 @@ static int icmpv6_packet(struct nf_conn *ct,
 		[ICMPV6_NI_QUERY - 128] = 1
 	};
 
+	if (state->pf != NFPROTO_IPV6)
+		return -NF_ACCEPT;
+
 	if (!nf_ct_is_confirmed(ct)) {
 		int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
 
@@ -150,7 +153,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
-	inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
+	inproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
 	   been preserved inside the ICMP. */
@@ -360,7 +363,6 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
 
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 {
-	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_ICMPV6,
 	.pkt_to_tuple		= icmpv6_pkt_to_tuple,
 	.invert_tuple		= icmpv6_invert_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 9cf59ab280c0..3d719d3eb9a3 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -759,8 +759,7 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.sctp.pn;
 }
 
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
-	.l3proto		= PF_INET,
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
 	.l4proto 		= IPPROTO_SCTP,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack	= sctp_print_conntrack,
@@ -789,36 +788,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.init_net		= sctp_init_net,
 	.get_net_proto		= sctp_get_net_proto,
 };
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
-	.l3proto		= PF_INET6,
-	.l4proto 		= IPPROTO_SCTP,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-	.print_conntrack	= sctp_print_conntrack,
-#endif
-	.packet 		= sctp_packet,
-	.can_early_drop		= sctp_can_early_drop,
-	.me 			= THIS_MODULE,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.nlattr_size		= SCTP_NLATTR_SIZE,
-	.to_nlattr		= sctp_to_nlattr,
-	.from_nlattr		= nlattr_to_sctp,
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= sctp_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= sctp_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_SCTP_MAX,
-		.obj_size	= sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
-		.nla_policy	= sctp_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-	.init_net		= sctp_init_net,
-	.get_net_proto		= sctp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index f4954fa7be25..643e4edfa0c1 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1534,9 +1534,8 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.tcp.pn;
 }
 
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
 {
-	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_TCP,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack 	= tcp_print_conntrack,
@@ -1565,37 +1564,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 	.init_net		= tcp_init_net,
 	.get_net_proto		= tcp_get_net_proto,
 };
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
-{
-	.l3proto		= PF_INET6,
-	.l4proto 		= IPPROTO_TCP,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-	.print_conntrack 	= tcp_print_conntrack,
-#endif
-	.packet 		= tcp_packet,
-	.can_early_drop		= tcp_can_early_drop,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.nlattr_size		= TCP_NLATTR_SIZE,
-	.to_nlattr		= tcp_to_nlattr,
-	.from_nlattr		= nlattr_to_tcp,
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= tcp_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= tcp_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_TCP_MAX,
-		.obj_size	= sizeof(unsigned int) *
-					TCP_CONNTRACK_TIMEOUT_MAX,
-		.nla_policy	= tcp_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-	.init_net		= tcp_init_net,
-	.get_net_proto		= tcp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 4645bf5b20c8..a7aa70370913 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -310,9 +310,8 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net)
 	return &net->ct.nf_ct_proto.udp.pn;
 }
 
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
 {
-	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
@@ -334,12 +333,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.init_net		= udp_init_net,
 	.get_net_proto		= udp_get_net_proto,
 };
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
 
 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite =
 {
-	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.packet			= udplite_packet,
@@ -361,59 +358,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.init_net		= udp_init_net,
 	.get_net_proto		= udp_get_net_proto,
 };
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
-#endif
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
-{
-	.l3proto		= PF_INET6,
-	.l4proto		= IPPROTO_UDP,
-	.allow_clash		= true,
-	.packet			= udp_packet,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_UDP_MAX,
-		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
-		.nla_policy	= udp_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-	.init_net		= udp_init_net,
-	.get_net_proto		= udp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
-
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
-{
-	.l3proto		= PF_INET6,
-	.l4proto		= IPPROTO_UDPLITE,
-	.allow_clash		= true,
-	.packet			= udplite_packet,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= udp_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= udp_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_UDP_MAX,
-		.obj_size	= sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
-		.nla_policy	= udp_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-	.init_net		= udp_init_net,
-	.get_net_proto		= udp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
 #endif
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index e3b329ebafd3..463d17d349c1 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -292,7 +292,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (!net_eq(nf_ct_net(ct), net))
 		goto release;
 
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
 	WARN_ON(!l4proto);
 
 	ret = -ENOSPC;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index d8125616edc7..0c233cfcc84d 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -120,7 +120,7 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct)
 	if (l4num == IPPROTO_TCP)
 		flow_offload_fixup_tcp(&ct->proto.tcp);
 
-	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num);
+	l4proto = __nf_ct_l4proto_find(l4num);
 	if (!l4proto)
 		return;
 
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 6ca0df7f416f..b48545b84ce8 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -122,7 +122,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
 		return -EBUSY;
 	}
 
-	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+	l4proto = nf_ct_l4proto_find_get(l4num);
 
 	/* This protocol is not supportted, skip. */
 	if (l4proto->l4proto != l4num) {
@@ -361,7 +361,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
 
 	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
 	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
-	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+	l4proto = nf_ct_l4proto_find_get(l4num);
 
 	/* This protocol is not supported, skip. */
 	if (l4proto->l4proto != l4num) {
@@ -383,7 +383,7 @@ err:
 
 static int
 cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
-			    u32 seq, u32 type, int event,
+			    u32 seq, u32 type, int event, u16 l3num,
 			    const struct nf_conntrack_l4proto *l4proto)
 {
 	struct nlmsghdr *nlh;
@@ -402,7 +402,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
 	nfmsg->version = NFNETLINK_V0;
 	nfmsg->res_id = 0;
 
-	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) ||
+	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
 	    nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
 		goto nla_put_failure;
 
@@ -442,7 +442,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 
 	l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
 	l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
-	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+	l4proto = nf_ct_l4proto_find_get(l4num);
 
 	/* This protocol is not supported, skip. */
 	if (l4proto->l4proto != l4num) {
@@ -460,6 +460,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 					  nlh->nlmsg_seq,
 					  NFNL_MSG_TYPE(nlh->nlmsg_type),
 					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+					  l3num,
 					  l4proto);
 	if (ret <= 0) {
 		kfree_skb(skb2);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 17ae5059c312..d74afa70774f 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -855,7 +855,7 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
 	l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
 	priv->l4proto = l4num;
 
-	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+	l4proto = nf_ct_l4proto_find_get(l4num);
 
 	if (l4proto->l4proto != l4num) {
 		ret = -EOPNOTSUPP;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 89457efd2e00..2c7a4b80206f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -159,7 +159,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
 	/* Make sure the timeout policy matches any existing protocol tracker,
 	 * otherwise default to generic.
 	 */
-	l4proto = __nf_ct_l4proto_find(par->family, proto);
+	l4proto = __nf_ct_l4proto_find(proto);
 	if (timeout->l4proto->l4proto != l4proto->l4proto) {
 		ret = -EINVAL;
 		pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n",
-- 
cgit v1.2.3


From 93185c80a5f748620f5652e492f2a1c8d89db593 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 12 Sep 2018 15:19:14 +0200
Subject: netfilter: conntrack: clamp l4proto array size at largers supported
 protocol

All higher l4proto numbers are handled by the generic tracker; the
l4proto lookup function already returns generic one in case the l4proto
number exceeds max size.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h | 2 +-
 net/netfilter/nf_conntrack_proto.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index d838a93430a1..eed04af9b75e 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -102,7 +102,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
 /* Existing built-in generic protocol */
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
 
-#define MAX_NF_CT_PROTO 256
+#define MAX_NF_CT_PROTO IPPROTO_UDPLITE
 
 const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto);
 
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 69d7170cfa8c..40643af7137e 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -43,7 +43,7 @@
 
 extern unsigned int nf_conntrack_net_id;
 
-static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO] __read_mostly;
+static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly;
 
 static DEFINE_MUTEX(nf_ct_proto_mutex);
 
-- 
cgit v1.2.3


From 78f2756c5fc0bf17560766dbc5aaa1e4a7ba66e4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 20 Sep 2018 13:50:47 -0700
Subject: net/ipv4: Move device validation to helper

Move the device matching check in __fib_validate_source to a helper and
export it for use by netfilter modules. Code move only; no functional
change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  1 +
 net/ipv4/fib_frontend.c | 44 +++++++++++++++++++++++++++-----------------
 2 files changed, 28 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 69c91d1934c1..f7c109e37298 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -373,6 +373,7 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net,
 extern const struct nla_policy rtm_ipv4_policy[];
 void ip_fib_init(void);
 __be32 fib_compute_spec_dst(struct sk_buff *skb);
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			u8 tos, int oif, struct net_device *dev,
 			struct in_device *idev, u32 *itag);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2998b0e47d4b..222b968de94c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 }
 
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
+{
+	bool dev_match = false;
+	int ret;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	for (ret = 0; ret < fi->fib_nhs; ret++) {
+		struct fib_nh *nh = &fi->fib_nh[ret];
+
+		if (nh->nh_dev == dev) {
+			dev_match = true;
+			break;
+		} else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+			dev_match = true;
+			break;
+		}
+	}
+#else
+	if (fi->fib_nh[0].nh_dev == dev)
+		dev_match = true;
+#endif
+
+	return dev_match;
+}
+EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
@@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 		goto e_inval;
 	fib_combine_itag(itag, &res);
-	dev_match = false;
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
-		struct fib_nh *nh = &res.fi->fib_nh[ret];
 
-		if (nh->nh_dev == dev) {
-			dev_match = true;
-			break;
-		} else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
-			dev_match = true;
-			break;
-		}
-	}
-#else
-	if (FIB_RES_DEV(res) == dev)
-		dev_match = true;
-#endif
+	dev_match = fib_info_nh_uses_dev(res.fi, dev);
 	if (dev_match) {
 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		return ret;
-- 
cgit v1.2.3


From 5d773ff41a7cdf0ef6cc6647435d59f0cf53e7b1 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Mon, 17 Sep 2018 13:30:46 +0300
Subject: net/mlx5: Rename incorrect naming in IFC file

Remove a trailing underscore from the multicast/unicast names.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/qp.c                     | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 2 +-
 include/linux/mlx5/mlx5_ifc.h                       | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6cba2a02d11b..daf1eb84cd31 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1279,7 +1279,7 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 
 	if (dev->rep)
 		MLX5_SET(tirc, tirc, self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_);
+			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST);
 
 	err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
 
@@ -1582,7 +1582,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 create_tir:
 	if (dev->rep)
 		MLX5_SET(tirc, tirc, self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_);
+			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST);
 
 	err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index db3278cc052b..3078491cc0d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -153,7 +153,7 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb)
 
 	if (enable_uc_lb)
 		MLX5_SET(modify_tir_in, in, ctx.self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_);
+			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST);
 
 	MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3a4a2e0567e9..4c7a1d25d73b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2559,8 +2559,8 @@ enum {
 };
 
 enum {
-	MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_    = 0x1,
-	MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST_  = 0x2,
+	MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST    = 0x1,
+	MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST  = 0x2,
 };
 
 struct mlx5_ifc_tirc_bits {
-- 
cgit v1.2.3


From a42055e8d2c30d4decfc13ce943d09c7b9dad221 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Fri, 21 Sep 2018 09:46:13 +0530
Subject: net/tls: Add support for async encryption of records for performance

In current implementation, tls records are encrypted & transmitted
serially. Till the time the previously submitted user data is encrypted,
the implementation waits and on finish starts transmitting the record.
This approach of encrypt-one record at a time is inefficient when
asynchronous crypto accelerators are used. For each record, there are
overheads of interrupts, driver softIRQ scheduling etc. Also the crypto
accelerator sits idle most of time while an encrypted record's pages are
handed over to tcp stack for transmission.

This patch enables encryption of multiple records in parallel when an
async capable crypto accelerator is present in system. This is achieved
by allowing the user space application to send more data using sendmsg()
even while previously issued data is being processed by crypto
accelerator. This requires returning the control back to user space
application after submitting encryption request to accelerator. This
also means that zero-copy mode of encryption cannot be used with async
accelerator as we must be done with user space application buffer before
returning from sendmsg().

There can be multiple records in flight to/from the accelerator. Each of
the record is represented by 'struct tls_rec'. This is used to store the
memory pages for the record.

After the records are encrypted, they are added in a linked list called
tx_ready_list which contains encrypted tls records sorted as per tls
sequence number. The records from tx_ready_list are transmitted using a
newly introduced function called tls_tx_records(). The tx_ready_list is
polled for any record ready to be transmitted in sendmsg(), sendpage()
after initiating encryption of new tls records. This achieves parallel
encryption and transmission of records when async accelerator is
present.

There could be situation when crypto accelerator completes encryption
later than polling of tx_ready_list by sendmsg()/sendpage(). Therefore
we need a deferred work context to be able to transmit records from
tx_ready_list. The deferred work context gets scheduled if applications
are not sending much data through the socket. If the applications issue
sendmsg()/sendpage() in quick succession, then the scheduling of
tx_work_handler gets cancelled as the tx_ready_list would be polled from
application's context itself. This saves scheduling overhead of deferred
work.

The patch also brings some side benefit. We are able to get rid of the
concept of CLOSED record. This is because the records once closed are
either encrypted and then placed into tx_ready_list or if encryption
fails, the socket error is set. This simplifies the kernel tls
sendpath. However since tls_device.c is still using macros, accessory
functions for CLOSED records have been retained.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h  |  70 +++++--
 net/tls/tls_main.c |  54 ++---
 net/tls/tls_sw.c   | 586 ++++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 522 insertions(+), 188 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 9f3c4ea9ad6f..3aa73e2d8823 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -41,7 +41,7 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 #include <net/strparser.h>
-
+#include <crypto/aead.h>
 #include <uapi/linux/tls.h>
 
 
@@ -93,24 +93,47 @@ enum {
 	TLS_NUM_CONFIG,
 };
 
-struct tls_sw_context_tx {
-	struct crypto_aead *aead_send;
-	struct crypto_wait async_wait;
-
-	char aad_space[TLS_AAD_SPACE_SIZE];
-
-	unsigned int sg_plaintext_size;
-	int sg_plaintext_num_elem;
+/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages
+ * allocated or mapped for each TLS record. After encryption, the records are
+ * stores in a linked list.
+ */
+struct tls_rec {
+	struct list_head list;
+	int tx_flags;
 	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
-
-	unsigned int sg_encrypted_size;
-	int sg_encrypted_num_elem;
 	struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS];
 
 	/* AAD | sg_plaintext_data | sg_tag */
 	struct scatterlist sg_aead_in[2];
 	/* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
 	struct scatterlist sg_aead_out[2];
+
+	unsigned int sg_plaintext_size;
+	unsigned int sg_encrypted_size;
+	int sg_plaintext_num_elem;
+	int sg_encrypted_num_elem;
+
+	char aad_space[TLS_AAD_SPACE_SIZE];
+	struct aead_request aead_req;
+	u8 aead_req_ctx[];
+};
+
+struct tx_work {
+	struct delayed_work work;
+	struct sock *sk;
+};
+
+struct tls_sw_context_tx {
+	struct crypto_aead *aead_send;
+	struct crypto_wait async_wait;
+	struct tx_work tx_work;
+	struct tls_rec *open_rec;
+	struct list_head tx_ready_list;
+	atomic_t encrypt_pending;
+	int async_notify;
+
+#define BIT_TX_SCHEDULED	0
+	unsigned long tx_bitmask;
 };
 
 struct tls_sw_context_rx {
@@ -197,6 +220,8 @@ struct tls_context {
 
 	struct scatterlist *partially_sent_record;
 	u16 partially_sent_offset;
+	u64 tx_seq_number;	/* Next TLS seqnum to be transmitted */
+
 	unsigned long flags;
 	bool in_tcp_sendpages;
 
@@ -261,6 +286,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
 void tls_device_sk_destruct(struct sock *sk);
 void tls_device_init(void);
 void tls_device_cleanup(void);
+int tls_tx_records(struct sock *sk, int flags);
 
 struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
 				       u32 seq, u64 *p_record_sn);
@@ -279,6 +305,9 @@ void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 int tls_push_sg(struct sock *sk, struct tls_context *ctx,
 		struct scatterlist *sg, u16 first_offset,
 		int flags);
+int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
+			    int flags);
+
 int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
 				   int flags, long *timeo);
 
@@ -312,6 +341,23 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
 	return tls_ctx->pending_open_record_frags;
 }
 
+static inline bool is_tx_ready(struct tls_context *tls_ctx,
+			       struct tls_sw_context_tx *ctx)
+{
+	struct tls_rec *rec;
+	u64 seq;
+
+	rec = list_first_entry(&ctx->tx_ready_list, struct tls_rec, list);
+	if (!rec)
+		return false;
+
+	seq = be64_to_cpup((const __be64 *)&rec->aad_space);
+	if (seq == tls_ctx->tx_seq_number)
+		return true;
+	else
+		return false;
+}
+
 struct sk_buff *
 tls_validate_xmit_skb(struct sock *sk, struct net_device *dev,
 		      struct sk_buff *skb);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 523622dc74f8..06094de7a3d9 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -141,7 +141,6 @@ retry:
 		size = sg->length;
 	}
 
-	clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
 	ctx->in_tcp_sendpages = false;
 	ctx->sk_write_space(sk);
 
@@ -193,15 +192,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
 	return rc;
 }
 
-int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
-				   int flags, long *timeo)
+int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
+			    int flags)
 {
 	struct scatterlist *sg;
 	u16 offset;
 
-	if (!tls_is_partially_sent_record(ctx))
-		return ctx->push_pending_record(sk, flags);
-
 	sg = ctx->partially_sent_record;
 	offset = ctx->partially_sent_offset;
 
@@ -209,9 +205,23 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
 	return tls_push_sg(sk, ctx, sg, offset, flags);
 }
 
+int tls_push_pending_closed_record(struct sock *sk,
+				   struct tls_context *tls_ctx,
+				   int flags, long *timeo)
+{
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+
+	if (tls_is_partially_sent_record(tls_ctx) ||
+	    !list_empty(&ctx->tx_ready_list))
+		return tls_tx_records(sk, flags);
+	else
+		return tls_ctx->push_pending_record(sk, flags);
+}
+
 static void tls_write_space(struct sock *sk)
 {
 	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);
 
 	/* If in_tcp_sendpages call lower protocol write space handler
 	 * to ensure we wake up any waiting operations there. For example
@@ -222,20 +232,11 @@ static void tls_write_space(struct sock *sk)
 		return;
 	}
 
-	if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) {
-		gfp_t sk_allocation = sk->sk_allocation;
-		int rc;
-		long timeo = 0;
-
-		sk->sk_allocation = GFP_ATOMIC;
-		rc = tls_push_pending_closed_record(sk, ctx,
-						    MSG_DONTWAIT |
-						    MSG_NOSIGNAL,
-						    &timeo);
-		sk->sk_allocation = sk_allocation;
-
-		if (rc < 0)
-			return;
+	/* Schedule the transmission if tx list is ready */
+	if (is_tx_ready(ctx, tx_ctx) && !sk->sk_write_pending) {
+		/* Schedule the transmission */
+		if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
+			schedule_delayed_work(&tx_ctx->tx_work.work, 0);
 	}
 
 	ctx->sk_write_space(sk);
@@ -270,19 +271,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
 		tls_handle_open_record(sk, 0);
 
-	if (ctx->partially_sent_record) {
-		struct scatterlist *sg = ctx->partially_sent_record;
-
-		while (1) {
-			put_page(sg_page(sg));
-			sk_mem_uncharge(sk, sg->length);
-
-			if (sg_is_last(sg))
-				break;
-			sg++;
-		}
-	}
-
 	/* We need these for tls_sw_fallback handling of other packets */
 	if (ctx->tx_conf == TLS_SW) {
 		kfree(ctx->tx.rec_seq);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 5ff51bac8b46..bcb24c498b84 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -246,18 +246,19 @@ static void trim_both_sgl(struct sock *sk, int target_size)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
 
-	trim_sg(sk, ctx->sg_plaintext_data,
-		&ctx->sg_plaintext_num_elem,
-		&ctx->sg_plaintext_size,
+	trim_sg(sk, rec->sg_plaintext_data,
+		&rec->sg_plaintext_num_elem,
+		&rec->sg_plaintext_size,
 		target_size);
 
 	if (target_size > 0)
 		target_size += tls_ctx->tx.overhead_size;
 
-	trim_sg(sk, ctx->sg_encrypted_data,
-		&ctx->sg_encrypted_num_elem,
-		&ctx->sg_encrypted_size,
+	trim_sg(sk, rec->sg_encrypted_data,
+		&rec->sg_encrypted_num_elem,
+		&rec->sg_encrypted_size,
 		target_size);
 }
 
@@ -265,15 +266,16 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
 	int rc = 0;
 
 	rc = sk_alloc_sg(sk, len,
-			 ctx->sg_encrypted_data, 0,
-			 &ctx->sg_encrypted_num_elem,
-			 &ctx->sg_encrypted_size, 0);
+			 rec->sg_encrypted_data, 0,
+			 &rec->sg_encrypted_num_elem,
+			 &rec->sg_encrypted_size, 0);
 
 	if (rc == -ENOSPC)
-		ctx->sg_encrypted_num_elem = ARRAY_SIZE(ctx->sg_encrypted_data);
+		rec->sg_encrypted_num_elem = ARRAY_SIZE(rec->sg_encrypted_data);
 
 	return rc;
 }
@@ -282,14 +284,15 @@ static int alloc_plaintext_sg(struct sock *sk, int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
 	int rc = 0;
 
-	rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
-			 &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
+	rc = sk_alloc_sg(sk, len, rec->sg_plaintext_data, 0,
+			 &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size,
 			 tls_ctx->pending_open_record_frags);
 
 	if (rc == -ENOSPC)
-		ctx->sg_plaintext_num_elem = ARRAY_SIZE(ctx->sg_plaintext_data);
+		rec->sg_plaintext_num_elem = ARRAY_SIZE(rec->sg_plaintext_data);
 
 	return rc;
 }
@@ -311,37 +314,192 @@ static void tls_free_both_sg(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
 
-	free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
-		&ctx->sg_encrypted_size);
+	/* Return if there is no open record */
+	if (!rec)
+		return;
+
+	free_sg(sk, rec->sg_encrypted_data,
+		&rec->sg_encrypted_num_elem,
+		&rec->sg_encrypted_size);
+
+	free_sg(sk, rec->sg_plaintext_data,
+		&rec->sg_plaintext_num_elem,
+		&rec->sg_plaintext_size);
+}
+
+static bool append_tx_ready_list(struct tls_context *tls_ctx,
+				 struct tls_sw_context_tx *ctx,
+				 struct tls_rec *enc_rec)
+{
+	u64 new_seq = be64_to_cpup((const __be64 *)&enc_rec->aad_space);
+	struct list_head *pos;
+
+	/* Need to insert encrypted record in tx_ready_list sorted
+	 * as per sequence number. Traverse linked list from tail.
+	 */
+	list_for_each_prev(pos, &ctx->tx_ready_list) {
+		struct tls_rec *rec = (struct tls_rec *)pos;
+		u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space);
+
+		if (new_seq > seq)
+			break;
+	}
+
+	list_add((struct list_head *)&enc_rec->list, pos);
+
+	return is_tx_ready(tls_ctx, ctx);
+}
+
+int tls_tx_records(struct sock *sk, int flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec, *tmp;
+	int tx_flags, rc = 0;
+
+	if (tls_is_partially_sent_record(tls_ctx)) {
+		rec = list_first_entry(&ctx->tx_ready_list,
+				       struct tls_rec, list);
+
+		if (flags == -1)
+			tx_flags = rec->tx_flags;
+		else
+			tx_flags = flags;
+
+		rc = tls_push_partial_record(sk, tls_ctx, tx_flags);
+		if (rc)
+			goto tx_err;
+
+		/* Full record has been transmitted.
+		 * Remove the head of tx_ready_list
+		 */
+		tls_ctx->tx_seq_number++;
+		list_del(&rec->list);
+		kfree(rec);
+	}
+
+	/* Tx all ready records which have expected sequence number */
+	list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) {
+		u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space);
+
+		if (seq == tls_ctx->tx_seq_number) {
+			if (flags == -1)
+				tx_flags = rec->tx_flags;
+			else
+				tx_flags = flags;
+
+			rc = tls_push_sg(sk, tls_ctx,
+					 &rec->sg_encrypted_data[0],
+					 0, tx_flags);
+			if (rc)
+				goto tx_err;
+
+			tls_ctx->tx_seq_number++;
+			list_del(&rec->list);
+			kfree(rec);
+		} else {
+			break;
+		}
+	}
+
+tx_err:
+	if (rc < 0 && rc != -EAGAIN)
+		tls_err_abort(sk, EBADMSG);
+
+	return rc;
+}
+
+static void tls_encrypt_done(struct crypto_async_request *req, int err)
+{
+	struct aead_request *aead_req = (struct aead_request *)req;
+	struct sock *sk = req->data;
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec;
+	bool ready = false;
+	int pending;
+
+	rec = container_of(aead_req, struct tls_rec, aead_req);
+
+	rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
+	rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
+
+	free_sg(sk, rec->sg_plaintext_data,
+		&rec->sg_plaintext_num_elem, &rec->sg_plaintext_size);
+
+	/* Free the record if error is previously set on socket */
+	if (err || sk->sk_err) {
+		free_sg(sk, rec->sg_encrypted_data,
+			&rec->sg_encrypted_num_elem, &rec->sg_encrypted_size);
 
-	free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
-		&ctx->sg_plaintext_size);
+		kfree(rec);
+		rec = NULL;
+
+		/* If err is already set on socket, return the same code */
+		if (sk->sk_err) {
+			ctx->async_wait.err = sk->sk_err;
+		} else {
+			ctx->async_wait.err = err;
+			tls_err_abort(sk, err);
+		}
+	}
+
+	/* Append the record in tx queue */
+	if (rec)
+		ready = append_tx_ready_list(tls_ctx, ctx, rec);
+
+	pending = atomic_dec_return(&ctx->encrypt_pending);
+
+	if (!pending && READ_ONCE(ctx->async_notify))
+		complete(&ctx->async_wait.completion);
+
+	if (!ready)
+		return;
+
+	/* Schedule the transmission */
+	if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+		schedule_delayed_work(&ctx->tx_work.work, 1);
 }
 
-static int tls_do_encryption(struct tls_context *tls_ctx,
+static int tls_do_encryption(struct sock *sk,
+			     struct tls_context *tls_ctx,
 			     struct tls_sw_context_tx *ctx,
 			     struct aead_request *aead_req,
 			     size_t data_len)
 {
+	struct tls_rec *rec = ctx->open_rec;
 	int rc;
 
-	ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
-	ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
+	rec->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
+	rec->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
 
 	aead_request_set_tfm(aead_req, ctx->aead_send);
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
-	aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
+	aead_request_set_crypt(aead_req, rec->sg_aead_in,
+			       rec->sg_aead_out,
 			       data_len, tls_ctx->tx.iv);
 
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-				  crypto_req_done, &ctx->async_wait);
+				  tls_encrypt_done, sk);
+
+	atomic_inc(&ctx->encrypt_pending);
 
-	rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait);
+	rc = crypto_aead_encrypt(aead_req);
+	if (!rc || rc != -EINPROGRESS) {
+		atomic_dec(&ctx->encrypt_pending);
+		rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
+		rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
+	}
 
-	ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
-	ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
+	/* Case of encryption failure */
+	if (rc && rc != -EINPROGRESS)
+		return rc;
 
+	/* Unhook the record from context if encryption is not failure */
+	ctx->open_rec = NULL;
+	tls_advance_record_sn(sk, &tls_ctx->tx);
 	return rc;
 }
 
@@ -350,53 +508,49 @@ static int tls_push_record(struct sock *sk, int flags,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
 	struct aead_request *req;
 	int rc;
 
-	req = aead_request_alloc(ctx->aead_send, sk->sk_allocation);
-	if (!req)
-		return -ENOMEM;
+	if (!rec)
+		return 0;
 
-	sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
-	sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1);
+	rec->tx_flags = flags;
+	req = &rec->aead_req;
 
-	tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size,
+	sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem - 1);
+	sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem - 1);
+
+	tls_make_aad(rec->aad_space, rec->sg_plaintext_size,
 		     tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
 		     record_type);
 
 	tls_fill_prepend(tls_ctx,
-			 page_address(sg_page(&ctx->sg_encrypted_data[0])) +
-			 ctx->sg_encrypted_data[0].offset,
-			 ctx->sg_plaintext_size, record_type);
+			 page_address(sg_page(&rec->sg_encrypted_data[0])) +
+			 rec->sg_encrypted_data[0].offset,
+			 rec->sg_plaintext_size, record_type);
 
 	tls_ctx->pending_open_record_frags = 0;
-	set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags);
-
-	rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size);
-	if (rc < 0) {
-		/* If we are called from write_space and
-		 * we fail, we need to set this SOCK_NOSPACE
-		 * to trigger another write_space in the future.
-		 */
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		goto out_req;
-	}
 
-	free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
-		&ctx->sg_plaintext_size);
+	rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size);
+	if (rc == -EINPROGRESS)
+		return -EINPROGRESS;
 
-	ctx->sg_encrypted_num_elem = 0;
-	ctx->sg_encrypted_size = 0;
+	free_sg(sk, rec->sg_plaintext_data, &rec->sg_plaintext_num_elem,
+		&rec->sg_plaintext_size);
 
-	/* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
-	rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
-	if (rc < 0 && rc != -EAGAIN)
+	if (rc < 0) {
 		tls_err_abort(sk, EBADMSG);
+		return rc;
+	}
 
-	tls_advance_record_sn(sk, &tls_ctx->tx);
-out_req:
-	aead_request_free(req);
-	return rc;
+	/* Put the record in tx_ready_list and start tx if permitted.
+	 * This happens only when encryption is not asynchronous.
+	 */
+	if (append_tx_ready_list(tls_ctx, ctx, rec))
+		return tls_tx_records(sk, flags);
+
+	return 0;
 }
 
 static int tls_sw_push_pending_record(struct sock *sk, int flags)
@@ -473,11 +627,12 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	struct scatterlist *sg = ctx->sg_plaintext_data;
+	struct tls_rec *rec = ctx->open_rec;
+	struct scatterlist *sg = rec->sg_plaintext_data;
 	int copy, i, rc = 0;
 
 	for (i = tls_ctx->pending_open_record_frags;
-	     i < ctx->sg_plaintext_num_elem; ++i) {
+	     i < rec->sg_plaintext_num_elem; ++i) {
 		copy = sg[i].length;
 		if (copy_from_iter(
 				page_address(sg_page(&sg[i])) + sg[i].offset,
@@ -497,34 +652,85 @@ out:
 	return rc;
 }
 
-int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+struct tls_rec *get_rec(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	int ret;
-	int required_size;
+	struct tls_rec *rec;
+	int mem_size;
+
+	/* Return if we already have an open record */
+	if (ctx->open_rec)
+		return ctx->open_rec;
+
+	mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
+
+	rec = kzalloc(mem_size, sk->sk_allocation);
+	if (!rec)
+		return NULL;
+
+	sg_init_table(&rec->sg_plaintext_data[0],
+		      ARRAY_SIZE(rec->sg_plaintext_data));
+	sg_init_table(&rec->sg_encrypted_data[0],
+		      ARRAY_SIZE(rec->sg_encrypted_data));
+
+	sg_init_table(rec->sg_aead_in, 2);
+	sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
+		   sizeof(rec->aad_space));
+	sg_unmark_end(&rec->sg_aead_in[1]);
+	sg_chain(rec->sg_aead_in, 2, rec->sg_plaintext_data);
+
+	sg_init_table(rec->sg_aead_out, 2);
+	sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
+		   sizeof(rec->aad_space));
+	sg_unmark_end(&rec->sg_aead_out[1]);
+	sg_chain(rec->sg_aead_out, 2, rec->sg_encrypted_data);
+
+	ctx->open_rec = rec;
+
+	return rec;
+}
+
+int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
 	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send);
+	bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC;
+	unsigned char record_type = TLS_RECORD_TYPE_DATA;
+	bool is_kvec = msg->msg_iter.type & ITER_KVEC;
 	bool eor = !(msg->msg_flags & MSG_MORE);
 	size_t try_to_copy, copied = 0;
-	unsigned char record_type = TLS_RECORD_TYPE_DATA;
-	int record_room;
+	struct tls_rec *rec;
+	int required_size;
+	int num_async = 0;
 	bool full_record;
+	int record_room;
+	int num_zc = 0;
 	int orig_size;
-	bool is_kvec = msg->msg_iter.type & ITER_KVEC;
+	int ret;
 
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
 		return -ENOTSUPP;
 
 	lock_sock(sk);
 
-	ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo);
-	if (ret)
-		goto send_end;
+	/* Wait till there is any pending write on socket */
+	if (unlikely(sk->sk_write_pending)) {
+		ret = wait_on_pending_writer(sk, &timeo);
+		if (unlikely(ret))
+			goto send_end;
+	}
 
 	if (unlikely(msg->msg_controllen)) {
 		ret = tls_proccess_cmsg(sk, msg, &record_type);
-		if (ret)
-			goto send_end;
+		if (ret) {
+			if (ret == -EINPROGRESS)
+				num_async++;
+			else if (ret != -EAGAIN)
+				goto send_end;
+		}
 	}
 
 	while (msg_data_left(msg)) {
@@ -533,20 +739,27 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			goto send_end;
 		}
 
-		orig_size = ctx->sg_plaintext_size;
+		rec = get_rec(sk);
+		if (!rec) {
+			ret = -ENOMEM;
+			goto send_end;
+		}
+
+		orig_size = rec->sg_plaintext_size;
 		full_record = false;
 		try_to_copy = msg_data_left(msg);
-		record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
+		record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size;
 		if (try_to_copy >= record_room) {
 			try_to_copy = record_room;
 			full_record = true;
 		}
 
-		required_size = ctx->sg_plaintext_size + try_to_copy +
+		required_size = rec->sg_plaintext_size + try_to_copy +
 				tls_ctx->tx.overhead_size;
 
 		if (!sk_stream_memory_free(sk))
 			goto wait_for_sndbuf;
+
 alloc_encrypted:
 		ret = alloc_encrypted_sg(sk, required_size);
 		if (ret) {
@@ -557,33 +770,39 @@ alloc_encrypted:
 			 * actually allocated. The difference is due
 			 * to max sg elements limit
 			 */
-			try_to_copy -= required_size - ctx->sg_encrypted_size;
+			try_to_copy -= required_size - rec->sg_encrypted_size;
 			full_record = true;
 		}
-		if (!is_kvec && (full_record || eor)) {
+
+		if (!is_kvec && (full_record || eor) && !async_capable) {
 			ret = zerocopy_from_iter(sk, &msg->msg_iter,
-				try_to_copy, &ctx->sg_plaintext_num_elem,
-				&ctx->sg_plaintext_size,
-				ctx->sg_plaintext_data,
-				ARRAY_SIZE(ctx->sg_plaintext_data),
+				try_to_copy, &rec->sg_plaintext_num_elem,
+				&rec->sg_plaintext_size,
+				rec->sg_plaintext_data,
+				ARRAY_SIZE(rec->sg_plaintext_data),
 				true);
 			if (ret)
 				goto fallback_to_reg_send;
 
+			num_zc++;
 			copied += try_to_copy;
 			ret = tls_push_record(sk, msg->msg_flags, record_type);
-			if (ret)
-				goto send_end;
+			if (ret) {
+				if (ret == -EINPROGRESS)
+					num_async++;
+				else if (ret != -EAGAIN)
+					goto send_end;
+			}
 			continue;
 
 fallback_to_reg_send:
-			trim_sg(sk, ctx->sg_plaintext_data,
-				&ctx->sg_plaintext_num_elem,
-				&ctx->sg_plaintext_size,
+			trim_sg(sk, rec->sg_plaintext_data,
+				&rec->sg_plaintext_num_elem,
+				&rec->sg_plaintext_size,
 				orig_size);
 		}
 
-		required_size = ctx->sg_plaintext_size + try_to_copy;
+		required_size = rec->sg_plaintext_size + try_to_copy;
 alloc_plaintext:
 		ret = alloc_plaintext_sg(sk, required_size);
 		if (ret) {
@@ -594,13 +813,13 @@ alloc_plaintext:
 			 * actually allocated. The difference is due
 			 * to max sg elements limit
 			 */
-			try_to_copy -= required_size - ctx->sg_plaintext_size;
+			try_to_copy -= required_size - rec->sg_plaintext_size;
 			full_record = true;
 
-			trim_sg(sk, ctx->sg_encrypted_data,
-				&ctx->sg_encrypted_num_elem,
-				&ctx->sg_encrypted_size,
-				ctx->sg_plaintext_size +
+			trim_sg(sk, rec->sg_encrypted_data,
+				&rec->sg_encrypted_num_elem,
+				&rec->sg_encrypted_size,
+				rec->sg_plaintext_size +
 				tls_ctx->tx.overhead_size);
 		}
 
@@ -610,13 +829,12 @@ alloc_plaintext:
 
 		copied += try_to_copy;
 		if (full_record || eor) {
-push_record:
 			ret = tls_push_record(sk, msg->msg_flags, record_type);
 			if (ret) {
-				if (ret == -ENOMEM)
-					goto wait_for_memory;
-
-				goto send_end;
+				if (ret == -EINPROGRESS)
+					num_async++;
+				else if (ret != -EAGAIN)
+					goto send_end;
 			}
 		}
 
@@ -632,15 +850,37 @@ trim_sgl:
 			goto send_end;
 		}
 
-		if (tls_is_pending_closed_record(tls_ctx))
-			goto push_record;
-
-		if (ctx->sg_encrypted_size < required_size)
+		if (rec->sg_encrypted_size < required_size)
 			goto alloc_encrypted;
 
 		goto alloc_plaintext;
 	}
 
+	if (!num_async) {
+		goto send_end;
+	} else if (num_zc) {
+		/* Wait for pending encryptions to get completed */
+		smp_store_mb(ctx->async_notify, true);
+
+		if (atomic_read(&ctx->encrypt_pending))
+			crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
+		else
+			reinit_completion(&ctx->async_wait.completion);
+
+		WRITE_ONCE(ctx->async_notify, false);
+
+		if (ctx->async_wait.err) {
+			ret = ctx->async_wait.err;
+			copied = 0;
+		}
+	}
+
+	/* Transmit if any encryptions have completed */
+	if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
+		cancel_delayed_work(&ctx->tx_work.work);
+		tls_tx_records(sk, msg->msg_flags);
+	}
+
 send_end:
 	ret = sk_stream_error(sk, msg->msg_flags, ret);
 
@@ -651,16 +891,18 @@ send_end:
 int tls_sw_sendpage(struct sock *sk, struct page *page,
 		    int offset, size_t size, int flags)
 {
+	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	int ret;
-	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
-	bool eor;
-	size_t orig_size = size;
 	unsigned char record_type = TLS_RECORD_TYPE_DATA;
+	size_t orig_size = size;
 	struct scatterlist *sg;
+	struct tls_rec *rec;
+	int num_async = 0;
 	bool full_record;
 	int record_room;
+	bool eor;
+	int ret;
 
 	if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
 		      MSG_SENDPAGE_NOTLAST))
@@ -673,9 +915,12 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 
 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
-	ret = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
-	if (ret)
-		goto sendpage_end;
+	/* Wait till there is any pending write on socket */
+	if (unlikely(sk->sk_write_pending)) {
+		ret = wait_on_pending_writer(sk, &timeo);
+		if (unlikely(ret))
+			goto sendpage_end;
+	}
 
 	/* Call the sk_stream functions to manage the sndbuf mem. */
 	while (size > 0) {
@@ -686,14 +931,20 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 			goto sendpage_end;
 		}
 
+		rec = get_rec(sk);
+		if (!rec) {
+			ret = -ENOMEM;
+			goto sendpage_end;
+		}
+
 		full_record = false;
-		record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
+		record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size;
 		copy = size;
 		if (copy >= record_room) {
 			copy = record_room;
 			full_record = true;
 		}
-		required_size = ctx->sg_plaintext_size + copy +
+		required_size = rec->sg_plaintext_size + copy +
 			      tls_ctx->tx.overhead_size;
 
 		if (!sk_stream_memory_free(sk))
@@ -708,33 +959,32 @@ alloc_payload:
 			 * actually allocated. The difference is due
 			 * to max sg elements limit
 			 */
-			copy -= required_size - ctx->sg_plaintext_size;
+			copy -= required_size - rec->sg_plaintext_size;
 			full_record = true;
 		}
 
 		get_page(page);
-		sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem;
+		sg = rec->sg_plaintext_data + rec->sg_plaintext_num_elem;
 		sg_set_page(sg, page, copy, offset);
 		sg_unmark_end(sg);
 
-		ctx->sg_plaintext_num_elem++;
+		rec->sg_plaintext_num_elem++;
 
 		sk_mem_charge(sk, copy);
 		offset += copy;
 		size -= copy;
-		ctx->sg_plaintext_size += copy;
-		tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem;
+		rec->sg_plaintext_size += copy;
+		tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem;
 
 		if (full_record || eor ||
-		    ctx->sg_plaintext_num_elem ==
-		    ARRAY_SIZE(ctx->sg_plaintext_data)) {
-push_record:
+		    rec->sg_plaintext_num_elem ==
+		    ARRAY_SIZE(rec->sg_plaintext_data)) {
 			ret = tls_push_record(sk, flags, record_type);
 			if (ret) {
-				if (ret == -ENOMEM)
-					goto wait_for_memory;
-
-				goto sendpage_end;
+				if (ret == -EINPROGRESS)
+					num_async++;
+				else if (ret != -EAGAIN)
+					goto sendpage_end;
 			}
 		}
 		continue;
@@ -743,16 +993,20 @@ wait_for_sndbuf:
 wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
-			trim_both_sgl(sk, ctx->sg_plaintext_size);
+			trim_both_sgl(sk, rec->sg_plaintext_size);
 			goto sendpage_end;
 		}
 
-		if (tls_is_pending_closed_record(tls_ctx))
-			goto push_record;
-
 		goto alloc_payload;
 	}
 
+	if (num_async) {
+		/* Transmit if any encryptions have completed */
+		if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
+			cancel_delayed_work(&ctx->tx_work.work);
+			tls_tx_records(sk, flags);
+		}
+	}
 sendpage_end:
 	if (orig_size > size)
 		ret = orig_size - size;
@@ -1300,6 +1554,49 @@ void tls_sw_free_resources_tx(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec, *tmp;
+
+	/* Wait for any pending async encryptions to complete */
+	smp_store_mb(ctx->async_notify, true);
+	if (atomic_read(&ctx->encrypt_pending))
+		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
+
+	cancel_delayed_work_sync(&ctx->tx_work.work);
+
+	/* Tx whatever records we can transmit and abandon the rest */
+	tls_tx_records(sk, -1);
+
+	/* Free up un-sent records in tx_ready_list. First, free
+	 * the partially sent record if any at head of tx_list.
+	 */
+	if (tls_ctx->partially_sent_record) {
+		struct scatterlist *sg = tls_ctx->partially_sent_record;
+
+		while (1) {
+			put_page(sg_page(sg));
+			sk_mem_uncharge(sk, sg->length);
+
+			if (sg_is_last(sg))
+				break;
+			sg++;
+		}
+
+		tls_ctx->partially_sent_record = NULL;
+
+		rec = list_first_entry(&ctx->tx_ready_list,
+				       struct tls_rec, list);
+		list_del(&rec->list);
+		kfree(rec);
+	}
+
+	list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) {
+		free_sg(sk, rec->sg_encrypted_data,
+			&rec->sg_encrypted_num_elem,
+			&rec->sg_encrypted_size);
+
+		list_del(&rec->list);
+		kfree(rec);
+	}
 
 	crypto_free_aead(ctx->aead_send);
 	tls_free_both_sg(sk);
@@ -1336,6 +1633,24 @@ void tls_sw_free_resources_rx(struct sock *sk)
 	kfree(ctx);
 }
 
+/* The work handler to transmitt the encrypted records in tx_ready_list */
+static void tx_work_handler(struct work_struct *work)
+{
+	struct delayed_work *delayed_work = to_delayed_work(work);
+	struct tx_work *tx_work = container_of(delayed_work,
+					       struct tx_work, work);
+	struct sock *sk = tx_work->sk;
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+
+	if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+		return;
+
+	lock_sock(sk);
+	tls_tx_records(sk, -1);
+	release_sock(sk);
+}
+
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 {
 	struct tls_crypto_info *crypto_info;
@@ -1385,6 +1700,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		crypto_info = &ctx->crypto_send.info;
 		cctx = &ctx->tx;
 		aead = &sw_ctx_tx->aead_send;
+		INIT_LIST_HEAD(&sw_ctx_tx->tx_ready_list);
+		INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler);
+		sw_ctx_tx->tx_work.sk = sk;
 	} else {
 		crypto_init_wait(&sw_ctx_rx->async_wait);
 		crypto_info = &ctx->crypto_recv.info;
@@ -1435,26 +1753,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_iv;
 	}
 
-	if (sw_ctx_tx) {
-		sg_init_table(sw_ctx_tx->sg_encrypted_data,
-			      ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
-		sg_init_table(sw_ctx_tx->sg_plaintext_data,
-			      ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
-
-		sg_init_table(sw_ctx_tx->sg_aead_in, 2);
-		sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
-			   sizeof(sw_ctx_tx->aad_space));
-		sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
-		sg_chain(sw_ctx_tx->sg_aead_in, 2,
-			 sw_ctx_tx->sg_plaintext_data);
-		sg_init_table(sw_ctx_tx->sg_aead_out, 2);
-		sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
-			   sizeof(sw_ctx_tx->aad_space));
-		sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
-		sg_chain(sw_ctx_tx->sg_aead_out, 2,
-			 sw_ctx_tx->sg_encrypted_data);
-	}
-
 	if (!*aead) {
 		*aead = crypto_alloc_aead("gcm(aes)", 0, 0);
 		if (IS_ERR(*aead)) {
@@ -1491,6 +1789,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
 
 		strp_check_rcv(&sw_ctx_rx->strp);
+	} else {
+		ctx->tx_seq_number = be64_to_cpup((const __be64 *)rec_seq);
 	}
 
 	goto out;
-- 
cgit v1.2.3


From 30f8eb55873ef078f5f02f636061d9399debbeab Mon Sep 17 00:00:00 2001
From: Håkon Bugge <Haakon.Bugge@oracle.com>
Date: Fri, 21 Sep 2018 12:39:29 +0200
Subject: net: if_arp: Fix incorrect indents
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixing incorrect indents and align comments.

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_arp.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index 4605527ca41b..b68b4b3d9172 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -114,18 +114,18 @@
 
 /* ARP ioctl request. */
 struct arpreq {
-  struct sockaddr	arp_pa;		/* protocol address		*/
-  struct sockaddr	arp_ha;		/* hardware address		*/
-  int			arp_flags;	/* flags			*/
-  struct sockaddr       arp_netmask;    /* netmask (only for proxy arps) */
-  char			arp_dev[16];
+	struct sockaddr	arp_pa;		/* protocol address		 */
+	struct sockaddr	arp_ha;		/* hardware address		 */
+	int		arp_flags;	/* flags			 */
+	struct sockaddr arp_netmask;    /* netmask (only for proxy arps) */
+	char		arp_dev[16];
 };
 
 struct arpreq_old {
-  struct sockaddr	arp_pa;		/* protocol address		*/
-  struct sockaddr	arp_ha;		/* hardware address		*/
-  int			arp_flags;	/* flags			*/
-  struct sockaddr       arp_netmask;    /* netmask (only for proxy arps) */
+	struct sockaddr	arp_pa;		/* protocol address		 */
+	struct sockaddr	arp_ha;		/* hardware address		 */
+	int		arp_flags;	/* flags			 */
+	struct sockaddr	arp_netmask;    /* netmask (only for proxy arps) */
 };
 
 /* ARP Flag values. */
-- 
cgit v1.2.3


From 6a12709da354ea149fdf86c4c9aba5b5033e9cf2 Mon Sep 17 00:00:00 2001
From: Håkon Bugge <Haakon.Bugge@oracle.com>
Date: Fri, 21 Sep 2018 12:39:30 +0200
Subject: net: if_arp: use define instead of hard-coded value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

uapi/linux/if_arp.h includes linux/netdevice.h, which uses
IFNAMSIZ. Hence, use it instead of hard-coded value.

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_arp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index b68b4b3d9172..c3cc5a9e5eaf 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -118,7 +118,7 @@ struct arpreq {
 	struct sockaddr	arp_ha;		/* hardware address		 */
 	int		arp_flags;	/* flags			 */
 	struct sockaddr arp_netmask;    /* netmask (only for proxy arps) */
-	char		arp_dev[16];
+	char		arp_dev[IFNAMSIZ];
 };
 
 struct arpreq_old {
-- 
cgit v1.2.3


From 72b0094f918294e6cb8cf5c3b4520d928fbb1a57 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Sep 2018 08:51:46 -0700
Subject: tcp: switch tcp_clock_ns() to CLOCK_TAI base

TCP pacing is either implemented in sch_fq or internally.
We have the goal of being able to offload pacing on the NICS.

TCP will soon provide per skb skb->tstamp as early departure time.

Like ETF in commit 25db26a91364 ("net/sched: Introduce the ETF Qdisc")
we chose CLOCK_T as the clock base, so that TCP and pacers can share
a common clock, to get better RTT samples (without pacing artificially
inflating these samples).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 770917d0caa7..c6f0bc1dc678 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -732,7 +732,7 @@ void tcp_send_window_probe(struct sock *sk);
 
 static inline u64 tcp_clock_ns(void)
 {
-	return local_clock();
+	return ktime_get_tai_ns();
 }
 
 static inline u64 tcp_clock_us(void)
-- 
cgit v1.2.3


From 2fd66ffba50716fc5ab481c48db643af3bda2276 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Sep 2018 08:51:47 -0700
Subject: tcp: introduce tcp_skb_timestamp_us() helper

There are few places where TCP reads skb->skb_mstamp expecting
a value in usec unit.

skb->tstamp (aka skb->skb_mstamp) will soon store CLOCK_TAI nsec value.

Add tcp_skb_timestamp_us() to provide proper conversion when needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  8 +++++++-
 net/ipv4/tcp_input.c    | 11 ++++++-----
 net/ipv4/tcp_ipv4.c     |  2 +-
 net/ipv4/tcp_output.c   |  2 +-
 net/ipv4/tcp_rate.c     | 15 ++++++++-------
 net/ipv4/tcp_recovery.c |  5 +++--
 6 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c6f0bc1dc678..0ca5ea10dc06 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -774,6 +774,12 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 	return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
 }
 
+/* provide the departure time in us unit */
+static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
+{
+	return skb->skb_mstamp;
+}
+
 
 #define tcp_flag_byte(th) (((u_int8_t *)th)[13])
 
@@ -1940,7 +1946,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
 	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
 	u32 rto = inet_csk(sk)->icsk_rto;
-	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+	u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);
 
 	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d9034073138c..d703a0b3b6a2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1305,7 +1305,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
 			start_seq, end_seq, dup_sack, pcount,
-			skb->skb_mstamp);
+			tcp_skb_timestamp_us(skb));
 	tcp_rate_skb_delivered(sk, skb, state->rate);
 
 	if (skb == tp->lost_skb_hint)
@@ -1580,7 +1580,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
 						tcp_skb_pcount(skb),
-						skb->skb_mstamp);
+						tcp_skb_timestamp_us(skb));
 			tcp_rate_skb_delivered(sk, skb, state->rate);
 			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 				list_del_init(&skb->tcp_tsorted_anchor);
@@ -3103,7 +3103,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
 		} else if (!(sacked & TCPCB_SACKED_ACKED)) {
-			last_ackt = skb->skb_mstamp;
+			last_ackt = tcp_skb_timestamp_us(skb);
 			WARN_ON_ONCE(last_ackt == 0);
 			if (!first_ackt)
 				first_ackt = last_ackt;
@@ -3121,7 +3121,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
 				tcp_rack_advance(tp, sacked, scb->end_seq,
-						 skb->skb_mstamp);
+						 tcp_skb_timestamp_us(skb));
 		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3215,7 +3215,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
 		}
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
-		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
+		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+						    tcp_skb_timestamp_us(skb))) {
 		/* Do not re-arm RTO if the sack RTT is measured from data sent
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 09547ef9c4c6..1f2496e8620d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		BUG_ON(!skb);
 
 		tcp_mstamp_refresh(tp);
-		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
+		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 		remaining = icsk->icsk_rto -
 			    usecs_to_jiffies(delta_us);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 597dbd749f05..b95aa72d8823 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1966,7 +1966,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	head = tcp_rtx_queue_head(sk);
 	if (!head)
 		goto send_now;
-	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+	age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
 		goto send_now;
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 4dff40dad4dc..baed2186c7c6 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
 	  * bandwidth estimate.
 	  */
 	if (!tp->packets_out) {
-		tp->first_tx_mstamp  = skb->skb_mstamp;
-		tp->delivered_mstamp = skb->skb_mstamp;
+		u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+		tp->first_tx_mstamp  = tstamp_us;
+		tp->delivered_mstamp = tstamp_us;
 	}
 
 	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
@@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 		rs->is_app_limited   = scb->tx.is_app_limited;
 		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
 
+		/* Record send time of most recently ACKed packet: */
+		tp->first_tx_mstamp  = tcp_skb_timestamp_us(skb);
 		/* Find the duration of the "send phase" of this window: */
-		rs->interval_us      = tcp_stamp_us_delta(
-						skb->skb_mstamp,
-						scb->tx.first_tx_mstamp);
+		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+						     scb->tx.first_tx_mstamp);
 
-		/* Record send time of most recently ACKed packet: */
-		tp->first_tx_mstamp  = skb->skb_mstamp;
 	}
 	/* Mark off the skb delivered once it's sacked to avoid being
 	 * used again when it's cumulatively acked. For acked packets
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c81aadff769b..fdb715bdd2d1 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
 s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
 {
 	return tp->rack.rtt_us + reo_wnd -
-	       tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+	       tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
 }
 
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
@@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		    !(scb->sacked & TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+		if (!tcp_rack_sent_after(tp->rack.mstamp,
+					 tcp_skb_timestamp_us(skb),
 					 tp->rack.end_seq, scb->end_seq))
 			break;
 
-- 
cgit v1.2.3


From 9799ccb0e984a5c1311b22a212e7ff96e8b736de Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Sep 2018 08:51:49 -0700
Subject: tcp: add tcp_wstamp_ns socket field

TCP will soon provide earliest departure time on TX skbs.
It needs to track this in a new variable.

tcp_mstamp_refresh() needs to update this variable, and
became too big to stay an inline.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  2 ++
 include/net/tcp.h     | 12 +-----------
 net/ipv4/tcp_output.c | 16 ++++++++++++++++
 3 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 263e37271afd..848f5b25e178 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -248,6 +248,8 @@ struct tcp_sock {
 		syn_smc:1;	/* SYN includes SMC */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
+	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
+
 /* RTT measurement */
 	u64	tcp_mstamp;	/* most recent packet received/sent */
 	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ca5ea10dc06..370198fdc65d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -752,17 +752,7 @@ static inline u32 tcp_time_stamp_raw(void)
 	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
 }
 
-
-/* Refresh 1us clock of a TCP socket,
- * ensuring monotically increasing values.
- */
-static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
-{
-	u64 val = tcp_clock_us();
-
-	if (val > tp->tcp_mstamp)
-		tp->tcp_mstamp = val;
-}
+void tcp_mstamp_refresh(struct tcp_sock *tp);
 
 static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b95aa72d8823..5a8105e84f7c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,22 @@
 
 #include <trace/events/tcp.h>
 
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+	u64 val = tcp_clock_ns();
+
+	/* departure time for next data packet */
+	if (val > tp->tcp_wstamp_ns)
+		tp->tcp_wstamp_ns = val;
+
+	val = div_u64(val, NSEC_PER_USEC);
+	if (val > tp->tcp_mstamp)
+		tp->tcp_mstamp = val;
+}
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
-- 
cgit v1.2.3


From d3edd06ea8ea9e03de6567fda80b8be57e21a537 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Sep 2018 08:51:50 -0700
Subject: tcp: provide earliest departure time in skb->tstamp

Switch internal TCP skb->skb_mstamp to skb->skb_mstamp_ns,
from usec units to nsec units.

Do not clear skb->tstamp before entering IP stacks in TX,
so that qdisc or devices can implement pacing based on the
earliest departure time instead of socket sk->sk_pacing_rate

Packets are fed with tcp_wstamp_ns, and following patch
will update tcp_wstamp_ns when both TCP and sch_fq switch to
the earliest departure time mechanism.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 +-
 include/net/tcp.h      |  6 +++---
 net/ipv4/syncookies.c  |  2 +-
 net/ipv4/tcp.c         |  2 +-
 net/ipv4/tcp_output.c  | 13 ++++++-------
 net/ipv4/tcp_timer.c   |  2 +-
 6 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e3a53ca4a9b5..86f337e9a81d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -689,7 +689,7 @@ struct sk_buff {
 
 	union {
 		ktime_t		tstamp;
-		u64		skb_mstamp;
+		u64		skb_mstamp_ns; /* earliest departure time */
 	};
 	/*
 	 * This is the control buffer. It is free to use for every
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 370198fdc65d..ff15d8e0d525 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -761,13 +761,13 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
-	return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ);
 }
 
 /* provide the departure time in us unit */
 static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
 {
-	return skb->skb_mstamp;
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
 }
 
 
@@ -813,7 +813,7 @@ struct tcp_skb_cb {
 #define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
 #define TCPCB_LOST		0x04	/* SKB is lost			*/
 #define TCPCB_TAGBITS		0x07	/* All tag bits			*/
-#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp)	*/
+#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp_ns)	*/
 #define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
 #define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
 				TCPCB_REPAIRED)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c3387dfd725b..606f868d9f3f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req)
 		ts <<= TSBITS;
 		ts |= options;
 	}
-	return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
+	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
 }
 
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 67670fac7c8d..69c236943f56 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1295,7 +1295,7 @@ new_segment:
 			copy = size_goal;
 
 			/* All packets are restored as if they have
-			 * already been sent. skb_mstamp isn't set to
+			 * already been sent. skb_mstamp_ns isn't set to
 			 * avoid wrong rtt estimation.
 			 */
 			if (tp->repair)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5a8105e84f7c..957f7a0e21c0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1014,7 +1014,7 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 
 static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	skb->skb_mstamp = tp->tcp_mstamp;
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 }
 
@@ -1061,7 +1061,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
-	skb->skb_mstamp = tp->tcp_mstamp;
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
 	inet = inet_sk(sk);
 	tcb = TCP_SKB_CB(skb);
@@ -1165,8 +1165,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
 	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
-	/* Our usage of tstamp should remain private */
-	skb->tstamp = 0;
+	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
 
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -3221,10 +3220,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp = cookie_init_timestamp(req);
+		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
-		skb->skb_mstamp = tcp_clock_us();
+		skb->skb_mstamp_ns = tcp_clock_ns();
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
@@ -3440,7 +3439,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	syn->skb_mstamp = syn_data->skb_mstamp;
+	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
 
 	/* Now full SYN+DATA was cloned and sent (or not),
 	 * remove the SYN from the original skb (syn_data)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7fdf222a0bdf..61023d50cd60 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
 	 */
 	start_ts = tcp_skb_timestamp(skb);
 	if (!start_ts)
-		skb->skb_mstamp = tp->tcp_mstamp;
+		skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 	else if (icsk->icsk_user_timeout &&
 		 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
 		goto abort;
-- 
cgit v1.2.3


From 5e111210a44301304f9054e995bf33f69b6de76f Mon Sep 17 00:00:00 2001
From: Eelco Chaudron <echaudro@redhat.com>
Date: Fri, 21 Sep 2018 07:13:54 -0400
Subject: net/core: Add new basic hardware counter

Add a new hardware specific basic counter, TCA_STATS_BASIC_HW. This can
be used to count packets/bytes processed by hardware offload.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gen_stats.h        |  4 +++
 include/uapi/linux/gen_stats.h |  1 +
 net/core/gen_stats.c           | 73 +++++++++++++++++++++++++++++++-----------
 3 files changed, 59 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 883bb9085f15..946bd53a9f81 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -44,6 +44,10 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
+int gnet_stats_copy_basic_hw(const seqcount_t *running,
+			     struct gnet_dump *d,
+			     struct gnet_stats_basic_cpu __percpu *cpu,
+			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
 			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h
index 24a861c0d29d..065408e16a80 100644
--- a/include/uapi/linux/gen_stats.h
+++ b/include/uapi/linux/gen_stats.h
@@ -12,6 +12,7 @@ enum {
 	TCA_STATS_APP,
 	TCA_STATS_RATE_EST64,
 	TCA_STATS_PAD,
+	TCA_STATS_BASIC_HW,
 	__TCA_STATS_MAX,
 };
 #define TCA_STATS_MAX (__TCA_STATS_MAX - 1)
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 188d693cb251..65a2e820364f 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -162,30 +162,18 @@ __gnet_stats_copy_basic(const seqcount_t *running,
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
-/**
- * gnet_stats_copy_basic - copy basic statistics into statistic TLV
- * @running: seqcount_t pointer
- * @d: dumping handle
- * @cpu: copy statistic per cpu
- * @b: basic statistics
- *
- * Appends the basic statistics to the top level TLV created by
- * gnet_stats_start_copy().
- *
- * Returns 0 on success or -1 with the statistic lock released
- * if the room in the socket buffer was not sufficient.
- */
 int
-gnet_stats_copy_basic(const seqcount_t *running,
-		      struct gnet_dump *d,
-		      struct gnet_stats_basic_cpu __percpu *cpu,
-		      struct gnet_stats_basic_packed *b)
+___gnet_stats_copy_basic(const seqcount_t *running,
+			 struct gnet_dump *d,
+			 struct gnet_stats_basic_cpu __percpu *cpu,
+			 struct gnet_stats_basic_packed *b,
+			 int type)
 {
 	struct gnet_stats_basic_packed bstats = {0};
 
 	__gnet_stats_copy_basic(running, &bstats, cpu, b);
 
-	if (d->compat_tc_stats) {
+	if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
 		d->tc_stats.bytes = bstats.bytes;
 		d->tc_stats.packets = bstats.packets;
 	}
@@ -196,13 +184,60 @@ gnet_stats_copy_basic(const seqcount_t *running,
 		memset(&sb, 0, sizeof(sb));
 		sb.bytes = bstats.bytes;
 		sb.packets = bstats.packets;
-		return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
+		return gnet_stats_copy(d, type, &sb, sizeof(sb),
 				       TCA_STATS_PAD);
 	}
 	return 0;
 }
+
+/**
+ * gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @running: seqcount_t pointer
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic(const seqcount_t *running,
+		      struct gnet_dump *d,
+		      struct gnet_stats_basic_cpu __percpu *cpu,
+		      struct gnet_stats_basic_packed *b)
+{
+	return ___gnet_stats_copy_basic(running, d, cpu, b,
+					TCA_STATS_BASIC);
+}
 EXPORT_SYMBOL(gnet_stats_copy_basic);
 
+/**
+ * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
+ * @running: seqcount_t pointer
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic_hw(const seqcount_t *running,
+			 struct gnet_dump *d,
+			 struct gnet_stats_basic_cpu __percpu *cpu,
+			 struct gnet_stats_basic_packed *b)
+{
+	return ___gnet_stats_copy_basic(running, d, cpu, b,
+					TCA_STATS_BASIC_HW);
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
+
 /**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
  * @d: dumping handle
-- 
cgit v1.2.3


From 28169abadb08333eb607621faa3a1dd7109e0d45 Mon Sep 17 00:00:00 2001
From: Eelco Chaudron <echaudro@redhat.com>
Date: Fri, 21 Sep 2018 07:14:02 -0400
Subject: net/sched: Add hardware specific counters to TC actions

Add additional counters that will store the bytes/packets processed by
hardware. These will be exported through the netlink interface for
displaying by the iproute2 tc tool

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h  |  8 +++++---
 include/net/pkt_cls.h  |  2 +-
 net/sched/act_api.c    | 14 +++++++++++---
 net/sched/act_gact.c   |  6 +++++-
 net/sched/act_mirred.c |  5 ++++-
 5 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c6f195b3c706..1ddff3360592 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -31,10 +31,12 @@ struct tc_action {
 	int				tcfa_action;
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
+	struct gnet_stats_basic_packed	tcfa_bstats_hw;
 	struct gnet_stats_queue		tcfa_qstats;
 	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 	struct tc_cookie	__rcu *act_cookie;
 	struct tcf_chain	*goto_chain;
@@ -94,7 +96,7 @@ struct tc_action_ops {
 			struct netlink_callback *, int,
 			const struct tc_action_ops *,
 			struct netlink_ext_ack *);
-	void	(*stats_update)(struct tc_action *, u64, u32, u64);
+	void	(*stats_update)(struct tc_action *, u64, u32, u64, bool);
 	size_t  (*get_fill_size)(const struct tc_action *act);
 	struct net_device *(*get_dev)(const struct tc_action *a);
 	void	(*put_dev)(struct net_device *dev);
@@ -182,13 +184,13 @@ int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 #endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
-					   u64 packets, u64 lastuse)
+					   u64 packets, u64 lastuse, bool hw)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (!a->ops->stats_update)
 		return;
 
-	a->ops->stats_update(a, bytes, packets, lastuse);
+	a->ops->stats_update(a, bytes, packets, lastuse, hw);
 #endif
 }
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 75a3f3fdb359..bbfe27f86d5f 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -318,7 +318,7 @@ tcf_exts_stats_update(const struct tcf_exts *exts,
 	for (i = 0; i < exts->nr_actions; i++) {
 		struct tc_action *a = exts->actions[i];
 
-		tcf_action_stats_update(a, bytes, packets, lastuse);
+		tcf_action_stats_update(a, bytes, packets, lastuse, true);
 	}
 
 	preempt_enable();
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index fac8c769454f..3c7c23421885 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -81,6 +81,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
 static void free_tcf(struct tc_action *p)
 {
 	free_percpu(p->cpu_bstats);
+	free_percpu(p->cpu_bstats_hw);
 	free_percpu(p->cpu_qstats);
 
 	tcf_set_action_cookie(&p->act_cookie, NULL);
@@ -390,9 +391,12 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
 		if (!p->cpu_bstats)
 			goto err1;
+		p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!p->cpu_bstats_hw)
+			goto err2;
 		p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
 		if (!p->cpu_qstats)
-			goto err2;
+			goto err3;
 	}
 	spin_lock_init(&p->tcfa_lock);
 	p->tcfa_index = index;
@@ -404,15 +408,17 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 					&p->tcfa_rate_est,
 					&p->tcfa_lock, NULL, est);
 		if (err)
-			goto err3;
+			goto err4;
 	}
 
 	p->idrinfo = idrinfo;
 	p->ops = ops;
 	*a = p;
 	return 0;
-err3:
+err4:
 	free_percpu(p->cpu_qstats);
+err3:
+	free_percpu(p->cpu_bstats_hw);
 err2:
 	free_percpu(p->cpu_bstats);
 err1:
@@ -997,6 +1003,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
 		goto errout;
 
 	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+	    gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw,
+				     &p->tcfa_bstats_hw) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,
 				  &p->tcfa_qstats,
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index aa44d14b43c7..c89a7fa43d1b 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -157,7 +157,7 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
 }
 
 static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
-				  u64 lastuse)
+				  u64 lastuse, bool hw)
 {
 	struct tcf_gact *gact = to_gact(a);
 	int action = READ_ONCE(gact->tcf_action);
@@ -168,6 +168,10 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	if (action == TC_ACT_SHOT)
 		this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
 
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
+				   bytes, packets);
+
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index a9d64bfe5a2a..1dae5f2b358f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -283,12 +283,15 @@ out:
 }
 
 static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
-			     u64 lastuse)
+			     u64 lastuse, bool hw)
 {
 	struct tcf_mirred *m = to_mirred(a);
 	struct tcf_t *tm = &m->tcf_tm;
 
 	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+				   bytes, packets);
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
-- 
cgit v1.2.3


From fc6e8073f304010605f834cb2eb8c07c46461c9d Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Sat, 22 Sep 2018 21:26:20 -0700
Subject: neighbour: send netlink notification if NTF_ROUTER changes

send netlink notification if neigh_update results in NTF_ROUTER
change and if NEIGH_UPDATE_F_ISROUTER is on. Also move the
NTF_ROUTER change function into a helper.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 15 +++++++++++++++
 net/core/neighbour.c    |  7 ++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 6c1eecd56a4d..0874f7fcd859 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -544,4 +544,19 @@ static inline void neigh_update_ext_learned(struct neighbour *neigh, u32 flags,
 		*notify = 1;
 	}
 }
+
+static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags,
+					  int *notify)
+{
+	u8 ndm_flags = 0;
+
+	ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0;
+	if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) {
+		if (ndm_flags & NTF_ROUTER)
+			neigh->flags |= NTF_ROUTER;
+		else
+			neigh->flags &= ~NTF_ROUTER;
+		*notify = 1;
+	}
+}
 #endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0e1cad89184f..20e0d3308148 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1280,11 +1280,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 		neigh->arp_queue_len_bytes = 0;
 	}
 out:
-	if (update_isrouter) {
-		neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
-			(neigh->flags | NTF_ROUTER) :
-			(neigh->flags & ~NTF_ROUTER);
-	}
+	if (update_isrouter)
+		neigh_update_is_router(neigh, flags, &notify);
 	write_unlock_bh(&neigh->lock);
 
 	if (notify)
-- 
cgit v1.2.3


From 9932a29ab1be1427a2ccbdf852a0f131f2849685 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Mon, 24 Sep 2018 15:35:56 +0530
Subject: net/tls: Fixed race condition in async encryption

On processors with multi-engine crypto accelerators, it is possible that
multiple records get encrypted in parallel and their encryption
completion is notified to different cpus in multicore processor. This
leads to the situation where tls_encrypt_done() starts executing in
parallel on different cores. In current implementation, encrypted
records are queued to tx_ready_list in tls_encrypt_done(). This requires
addition to linked list 'tx_ready_list' to be protected. As
tls_decrypt_done() could be executing in irq content, it is not possible
to protect linked list addition operation using a lock.

To fix the problem, we remove linked list addition operation from the
irq context. We do tx_ready_list addition/removal operation from
application context only and get rid of possible multiple access to
the linked list. Before starting encryption on the record, we add it to
the tail of tx_ready_list. To prevent tls_tx_records() from transmitting
it, we mark the record with a new flag 'tx_ready' in 'struct tls_rec'.
When record encryption gets completed, tls_encrypt_done() has to only
update the 'tx_ready' flag to true & linked list add operation is not
required.

The changed logic brings some other side benefits. Since the records
are always submitted in tls sequence number order for encryption, the
tx_ready_list always remains sorted and addition of new records to it
does not have to traverse the linked list.

Lastly, we renamed tx_ready_list in 'struct tls_sw_context_tx' to
'tx_list'. This is because now, the some of the records at the tail are
not ready to transmit.

Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption")
Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h  | 16 ++++-------
 net/tls/tls_main.c |  4 +--
 net/tls/tls_sw.c   | 81 ++++++++++++++++++++----------------------------------
 3 files changed, 37 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 3aa73e2d8823..1615fb5ea114 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -99,6 +99,7 @@ enum {
  */
 struct tls_rec {
 	struct list_head list;
+	int tx_ready;
 	int tx_flags;
 	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
 	struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS];
@@ -128,7 +129,7 @@ struct tls_sw_context_tx {
 	struct crypto_wait async_wait;
 	struct tx_work tx_work;
 	struct tls_rec *open_rec;
-	struct list_head tx_ready_list;
+	struct list_head tx_list;
 	atomic_t encrypt_pending;
 	int async_notify;
 
@@ -220,7 +221,6 @@ struct tls_context {
 
 	struct scatterlist *partially_sent_record;
 	u16 partially_sent_offset;
-	u64 tx_seq_number;	/* Next TLS seqnum to be transmitted */
 
 	unsigned long flags;
 	bool in_tcp_sendpages;
@@ -341,21 +341,15 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
 	return tls_ctx->pending_open_record_frags;
 }
 
-static inline bool is_tx_ready(struct tls_context *tls_ctx,
-			       struct tls_sw_context_tx *ctx)
+static inline bool is_tx_ready(struct tls_sw_context_tx *ctx)
 {
 	struct tls_rec *rec;
-	u64 seq;
 
-	rec = list_first_entry(&ctx->tx_ready_list, struct tls_rec, list);
+	rec = list_first_entry(&ctx->tx_list, struct tls_rec, list);
 	if (!rec)
 		return false;
 
-	seq = be64_to_cpup((const __be64 *)&rec->aad_space);
-	if (seq == tls_ctx->tx_seq_number)
-		return true;
-	else
-		return false;
+	return READ_ONCE(rec->tx_ready);
 }
 
 struct sk_buff *
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 06094de7a3d9..b428069a1b05 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -212,7 +212,7 @@ int tls_push_pending_closed_record(struct sock *sk,
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 
 	if (tls_is_partially_sent_record(tls_ctx) ||
-	    !list_empty(&ctx->tx_ready_list))
+	    !list_empty(&ctx->tx_list))
 		return tls_tx_records(sk, flags);
 	else
 		return tls_ctx->push_pending_record(sk, flags);
@@ -233,7 +233,7 @@ static void tls_write_space(struct sock *sk)
 	}
 
 	/* Schedule the transmission if tx list is ready */
-	if (is_tx_ready(ctx, tx_ctx) && !sk->sk_write_pending) {
+	if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) {
 		/* Schedule the transmission */
 		if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
 			schedule_delayed_work(&tx_ctx->tx_work.work, 0);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bcb24c498b84..d30d65bf0753 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -329,29 +329,6 @@ static void tls_free_both_sg(struct sock *sk)
 		&rec->sg_plaintext_size);
 }
 
-static bool append_tx_ready_list(struct tls_context *tls_ctx,
-				 struct tls_sw_context_tx *ctx,
-				 struct tls_rec *enc_rec)
-{
-	u64 new_seq = be64_to_cpup((const __be64 *)&enc_rec->aad_space);
-	struct list_head *pos;
-
-	/* Need to insert encrypted record in tx_ready_list sorted
-	 * as per sequence number. Traverse linked list from tail.
-	 */
-	list_for_each_prev(pos, &ctx->tx_ready_list) {
-		struct tls_rec *rec = (struct tls_rec *)pos;
-		u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space);
-
-		if (new_seq > seq)
-			break;
-	}
-
-	list_add((struct list_head *)&enc_rec->list, pos);
-
-	return is_tx_ready(tls_ctx, ctx);
-}
-
 int tls_tx_records(struct sock *sk, int flags)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -360,7 +337,7 @@ int tls_tx_records(struct sock *sk, int flags)
 	int tx_flags, rc = 0;
 
 	if (tls_is_partially_sent_record(tls_ctx)) {
-		rec = list_first_entry(&ctx->tx_ready_list,
+		rec = list_first_entry(&ctx->tx_list,
 				       struct tls_rec, list);
 
 		if (flags == -1)
@@ -373,18 +350,15 @@ int tls_tx_records(struct sock *sk, int flags)
 			goto tx_err;
 
 		/* Full record has been transmitted.
-		 * Remove the head of tx_ready_list
+		 * Remove the head of tx_list
 		 */
-		tls_ctx->tx_seq_number++;
 		list_del(&rec->list);
 		kfree(rec);
 	}
 
-	/* Tx all ready records which have expected sequence number */
-	list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) {
-		u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space);
-
-		if (seq == tls_ctx->tx_seq_number) {
+	/* Tx all ready records */
+	list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
+		if (READ_ONCE(rec->tx_ready)) {
 			if (flags == -1)
 				tx_flags = rec->tx_flags;
 			else
@@ -396,7 +370,6 @@ int tls_tx_records(struct sock *sk, int flags)
 			if (rc)
 				goto tx_err;
 
-			tls_ctx->tx_seq_number++;
 			list_del(&rec->list);
 			kfree(rec);
 		} else {
@@ -446,9 +419,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 		}
 	}
 
-	/* Append the record in tx queue */
-	if (rec)
-		ready = append_tx_ready_list(tls_ctx, ctx, rec);
+	if (rec) {
+		struct tls_rec *first_rec;
+
+		/* Mark the record as ready for transmission */
+		smp_store_mb(rec->tx_ready, true);
+
+		/* If received record is at head of tx_list, schedule tx */
+		first_rec = list_first_entry(&ctx->tx_list,
+					     struct tls_rec, list);
+		if (rec == first_rec)
+			ready = true;
+	}
 
 	pending = atomic_dec_return(&ctx->encrypt_pending);
 
@@ -484,6 +466,8 @@ static int tls_do_encryption(struct sock *sk,
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				  tls_encrypt_done, sk);
 
+	/* Add the record in tx_list */
+	list_add_tail((struct list_head *)&rec->list, &ctx->tx_list);
 	atomic_inc(&ctx->encrypt_pending);
 
 	rc = crypto_aead_encrypt(aead_req);
@@ -493,9 +477,12 @@ static int tls_do_encryption(struct sock *sk,
 		rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
 	}
 
-	/* Case of encryption failure */
-	if (rc && rc != -EINPROGRESS)
+	if (!rc) {
+		WRITE_ONCE(rec->tx_ready, true);
+	} else if (rc != -EINPROGRESS) {
+		list_del(&rec->list);
 		return rc;
+	}
 
 	/* Unhook the record from context if encryption is not failure */
 	ctx->open_rec = NULL;
@@ -544,13 +531,7 @@ static int tls_push_record(struct sock *sk, int flags,
 		return rc;
 	}
 
-	/* Put the record in tx_ready_list and start tx if permitted.
-	 * This happens only when encryption is not asynchronous.
-	 */
-	if (append_tx_ready_list(tls_ctx, ctx, rec))
-		return tls_tx_records(sk, flags);
-
-	return 0;
+	return tls_tx_records(sk, flags);
 }
 
 static int tls_sw_push_pending_record(struct sock *sk, int flags)
@@ -1566,7 +1547,7 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	/* Tx whatever records we can transmit and abandon the rest */
 	tls_tx_records(sk, -1);
 
-	/* Free up un-sent records in tx_ready_list. First, free
+	/* Free up un-sent records in tx_list. First, free
 	 * the partially sent record if any at head of tx_list.
 	 */
 	if (tls_ctx->partially_sent_record) {
@@ -1583,13 +1564,13 @@ void tls_sw_free_resources_tx(struct sock *sk)
 
 		tls_ctx->partially_sent_record = NULL;
 
-		rec = list_first_entry(&ctx->tx_ready_list,
+		rec = list_first_entry(&ctx->tx_list,
 				       struct tls_rec, list);
 		list_del(&rec->list);
 		kfree(rec);
 	}
 
-	list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) {
+	list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
 		free_sg(sk, rec->sg_encrypted_data,
 			&rec->sg_encrypted_num_elem,
 			&rec->sg_encrypted_size);
@@ -1633,7 +1614,7 @@ void tls_sw_free_resources_rx(struct sock *sk)
 	kfree(ctx);
 }
 
-/* The work handler to transmitt the encrypted records in tx_ready_list */
+/* The work handler to transmitt the encrypted records in tx_list */
 static void tx_work_handler(struct work_struct *work)
 {
 	struct delayed_work *delayed_work = to_delayed_work(work);
@@ -1700,7 +1681,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		crypto_info = &ctx->crypto_send.info;
 		cctx = &ctx->tx;
 		aead = &sw_ctx_tx->aead_send;
-		INIT_LIST_HEAD(&sw_ctx_tx->tx_ready_list);
+		INIT_LIST_HEAD(&sw_ctx_tx->tx_list);
 		INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler);
 		sw_ctx_tx->tx_work.sk = sk;
 	} else {
@@ -1789,8 +1770,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
 
 		strp_check_rcv(&sw_ctx_rx->strp);
-	} else {
-		ctx->tx_seq_number = be64_to_cpup((const __be64 *)rec_seq);
 	}
 
 	goto out;
-- 
cgit v1.2.3


From 9ba481e2eb3b932ae5b6278342b256e4f92d2793 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:20 +0300
Subject: net/mlx5: Set uid as part of CQ commands

Set uid as part of CQ commands so that the firmware can manage the CQ
object in a secured way.

The firmware should mark this CQ with the given uid so that it can
be used later on only by objects with the same uid.

Upon DEVX flows that use this CQ (e.g. create QP command), the
pointed CQ must have the same uid as of the issuer uid command.

When a command is issued with uid=0 it means that the issuer of the
command is trusted (i.e. kernel), in that case any pointed object
can be used regardless of its uid.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cq.c | 4 ++++
 include/linux/mlx5/cq.h                      | 1 +
 include/linux/mlx5/mlx5_ifc.h                | 6 +++---
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index a4179122a279..4b85abb5c9f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -109,6 +109,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	cq->cons_index = 0;
 	cq->arm_sn     = 0;
 	cq->eq         = eq;
+	cq->uid = MLX5_GET(create_cq_in, in, uid);
 	refcount_set(&cq->refcount, 1);
 	init_completion(&cq->free);
 	if (!cq->comp)
@@ -144,6 +145,7 @@ err_cmd:
 	memset(dout, 0, sizeof(dout));
 	MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ);
 	MLX5_SET(destroy_cq_in, din, cqn, cq->cqn);
+	MLX5_SET(destroy_cq_in, din, uid, cq->uid);
 	mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
 	return err;
 }
@@ -165,6 +167,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
 
 	MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ);
 	MLX5_SET(destroy_cq_in, in, cqn, cq->cqn);
+	MLX5_SET(destroy_cq_in, in, uid, cq->uid);
 	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 	if (err)
 		return err;
@@ -196,6 +199,7 @@ int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 	u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0};
 
 	MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ);
+	MLX5_SET(modify_cq_in, in, uid, cq->uid);
 	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
 }
 EXPORT_SYMBOL(mlx5_core_modify_cq);
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 0ef6138eca49..31a750570c38 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -61,6 +61,7 @@ struct mlx5_core_cq {
 	int			reset_notify_added;
 	struct list_head	reset_notify;
 	struct mlx5_eq		*eq;
+	u16 uid;
 };
 
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4c7a1d25d73b..1a412b355054 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5629,7 +5629,7 @@ enum {
 
 struct mlx5_ifc_modify_cq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6404,7 +6404,7 @@ struct mlx5_ifc_destroy_cq_out_bits {
 
 struct mlx5_ifc_destroy_cq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7164,7 +7164,7 @@ struct mlx5_ifc_create_cq_out_bits {
 
 struct mlx5_ifc_create_cq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From 4ac63ec72587f7426aae15ddfe78e8ab785724dc Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:21 +0300
Subject: net/mlx5: Set uid as part of QP commands

Set uid as part of QP commands so that the firmware can manage the
QP object in a secured way.

That will enable using a QP that was created by verbs application to
be used by the DEVX flow in case the uid is equal.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 45 +++++++++++++++++-----------
 include/linux/mlx5/mlx5_ifc.h                | 22 +++++++-------
 include/linux/mlx5/qp.h                      |  1 +
 3 files changed, 39 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 4ca07bfb6b14..4e2ab3c916bf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -240,6 +240,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
 	if (err)
 		return err;
 
+	qp->uid = MLX5_GET(create_qp_in, in, uid);
 	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
 	mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
 
@@ -261,6 +262,7 @@ err_cmd:
 	memset(dout, 0, sizeof(dout));
 	MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
 	MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
+	MLX5_SET(destroy_qp_in, din, uid, qp->uid);
 	mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
 	return err;
 }
@@ -320,6 +322,7 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
 
 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
 	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+	MLX5_SET(destroy_qp_in, in, uid, qp->uid);
 	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 	if (err)
 		return err;
@@ -373,7 +376,7 @@ static void mbox_free(struct mbox_info *mbox)
 
 static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn,
 				u32 opt_param_mask, void *qpc,
-				struct mbox_info *mbox)
+				struct mbox_info *mbox, u16 uid)
 {
 	mbox->out = NULL;
 	mbox->in = NULL;
@@ -381,26 +384,32 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn,
 #define MBOX_ALLOC(mbox, typ)  \
 	mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out))
 
-#define MOD_QP_IN_SET(typ, in, _opcode, _qpn) \
-	MLX5_SET(typ##_in, in, opcode, _opcode); \
-	MLX5_SET(typ##_in, in, qpn, _qpn)
-
-#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc) \
-	MOD_QP_IN_SET(typ, in, _opcode, _qpn); \
-	MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \
-	memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, MLX5_ST_SZ_BYTES(qpc))
+#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid)                            \
+	do {                                                                   \
+		MLX5_SET(typ##_in, in, opcode, _opcode);                       \
+		MLX5_SET(typ##_in, in, qpn, _qpn);                             \
+		MLX5_SET(typ##_in, in, uid, _uid);                             \
+	} while (0)
+
+#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid)          \
+	do {                                                                   \
+		MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid);                   \
+		MLX5_SET(typ##_in, in, opt_param_mask, _opt_p);                \
+		memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc,                  \
+		       MLX5_ST_SZ_BYTES(qpc));                                 \
+	} while (0)
 
 	switch (opcode) {
 	/* 2RST & 2ERR */
 	case MLX5_CMD_OP_2RST_QP:
 		if (MBOX_ALLOC(mbox, qp_2rst))
 			return -ENOMEM;
-		MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn);
+		MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid);
 		break;
 	case MLX5_CMD_OP_2ERR_QP:
 		if (MBOX_ALLOC(mbox, qp_2err))
 			return -ENOMEM;
-		MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn);
+		MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid);
 		break;
 
 	/* MODIFY with QPC */
@@ -408,37 +417,37 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn,
 		if (MBOX_ALLOC(mbox, rst2init_qp))
 			return -ENOMEM;
 		MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn,
-				  opt_param_mask, qpc);
+				  opt_param_mask, qpc, uid);
 		break;
 	case MLX5_CMD_OP_INIT2RTR_QP:
 		if (MBOX_ALLOC(mbox, init2rtr_qp))
 			return -ENOMEM;
 		MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn,
-				  opt_param_mask, qpc);
+				  opt_param_mask, qpc, uid);
 		break;
 	case MLX5_CMD_OP_RTR2RTS_QP:
 		if (MBOX_ALLOC(mbox, rtr2rts_qp))
 			return -ENOMEM;
 		MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn,
-				  opt_param_mask, qpc);
+				  opt_param_mask, qpc, uid);
 		break;
 	case MLX5_CMD_OP_RTS2RTS_QP:
 		if (MBOX_ALLOC(mbox, rts2rts_qp))
 			return -ENOMEM;
 		MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn,
-				  opt_param_mask, qpc);
+				  opt_param_mask, qpc, uid);
 		break;
 	case MLX5_CMD_OP_SQERR2RTS_QP:
 		if (MBOX_ALLOC(mbox, sqerr2rts_qp))
 			return -ENOMEM;
 		MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn,
-				  opt_param_mask, qpc);
+				  opt_param_mask, qpc, uid);
 		break;
 	case MLX5_CMD_OP_INIT2INIT_QP:
 		if (MBOX_ALLOC(mbox, init2init_qp))
 			return -ENOMEM;
 		MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn,
-				  opt_param_mask, qpc);
+				  opt_param_mask, qpc, uid);
 		break;
 	default:
 		mlx5_core_err(dev, "Unknown transition for modify QP: OP(0x%x) QPN(0x%x)\n",
@@ -456,7 +465,7 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode,
 	int err;
 
 	err = modify_qp_mbox_alloc(dev, opcode, qp->qpn,
-				   opt_param_mask, qpc, &mbox);
+				   opt_param_mask, qpc, &mbox, qp->uid);
 	if (err)
 		return err;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 1a412b355054..3b6a612787ac 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3394,7 +3394,7 @@ struct mlx5_ifc_sqerr2rts_qp_out_bits {
 
 struct mlx5_ifc_sqerr2rts_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -3424,7 +3424,7 @@ struct mlx5_ifc_sqd2rts_qp_out_bits {
 
 struct mlx5_ifc_sqd2rts_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -3629,7 +3629,7 @@ struct mlx5_ifc_rts2rts_qp_out_bits {
 
 struct mlx5_ifc_rts2rts_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -3659,7 +3659,7 @@ struct mlx5_ifc_rtr2rts_qp_out_bits {
 
 struct mlx5_ifc_rtr2rts_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -3689,7 +3689,7 @@ struct mlx5_ifc_rst2init_qp_out_bits {
 
 struct mlx5_ifc_rst2init_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5192,7 +5192,7 @@ struct mlx5_ifc_qp_2rst_out_bits {
 
 struct mlx5_ifc_qp_2rst_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5214,7 +5214,7 @@ struct mlx5_ifc_qp_2err_out_bits {
 
 struct mlx5_ifc_qp_2err_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5789,7 +5789,7 @@ struct mlx5_ifc_init2rtr_qp_out_bits {
 
 struct mlx5_ifc_init2rtr_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5819,7 +5819,7 @@ struct mlx5_ifc_init2init_qp_out_bits {
 
 struct mlx5_ifc_init2init_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6230,7 +6230,7 @@ struct mlx5_ifc_destroy_qp_out_bits {
 
 struct mlx5_ifc_destroy_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6895,7 +6895,7 @@ struct mlx5_ifc_create_qp_out_bits {
 
 struct mlx5_ifc_create_qp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 4778d41085d4..fbe322c966bc 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -471,6 +471,7 @@ struct mlx5_core_qp {
 	int			qpn;
 	struct mlx5_rsc_debug	*dbg;
 	int			pid;
+	u16			uid;
 };
 
 struct mlx5_core_dct {
-- 
cgit v1.2.3


From d269b3afffcb107375d9cf73127fc2e0181bc90b Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:22 +0300
Subject: net/mlx5: Set uid as part of RQ commands

Set uid as part of RQ commands so that the firmware can manage the
RQ object in a secured way.

That will enable using an RQ that was created by verbs application
to be used by the DEVX flow in case the uid is equal.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 16 ++++++++++++++--
 include/linux/mlx5/mlx5_ifc.h                |  6 +++---
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 4e2ab3c916bf..f57e08d4f970 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -540,6 +540,17 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
 }
 EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
 
+static void destroy_rq_tracked(struct mlx5_core_dev *dev, u32 rqn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rq_in)]   = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {};
+
+	MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ);
+	MLX5_SET(destroy_rq_in, in, rqn, rqn);
+	MLX5_SET(destroy_rq_in, in, uid, uid);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
 int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *rq)
 {
@@ -550,6 +561,7 @@ int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	if (err)
 		return err;
 
+	rq->uid = MLX5_GET(create_rq_in, in, uid);
 	rq->qpn = rqn;
 	err = create_resource_common(dev, rq, MLX5_RES_RQ);
 	if (err)
@@ -558,7 +570,7 @@ int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	return 0;
 
 err_destroy_rq:
-	mlx5_core_destroy_rq(dev, rq->qpn);
+	destroy_rq_tracked(dev, rq->qpn, rq->uid);
 
 	return err;
 }
@@ -568,7 +580,7 @@ void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
 				  struct mlx5_core_qp *rq)
 {
 	destroy_resource_common(dev, rq);
-	mlx5_core_destroy_rq(dev, rq->qpn);
+	destroy_rq_tracked(dev, rq->qpn, rq->uid);
 }
 EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3b6a612787ac..689631ca27b8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5488,7 +5488,7 @@ enum {
 
 struct mlx5_ifc_modify_rq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6164,7 +6164,7 @@ struct mlx5_ifc_destroy_rq_out_bits {
 
 struct mlx5_ifc_destroy_rq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6847,7 +6847,7 @@ struct mlx5_ifc_create_rq_out_bits {
 
 struct mlx5_ifc_create_rq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From 430ae0d5a3ce1350375690cb6ab29ab6fae4a9ac Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:23 +0300
Subject: net/mlx5: Set uid as part of SQ commands

Set uid as part of SQ commands so that the firmware can manage the
SQ object in a secured way.

That will enable using an SQ that was created by verbs application
to be used by the DEVX flow in case the uid is equal.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 16 ++++++++++++++--
 include/linux/mlx5/mlx5_ifc.h                |  6 +++---
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index f57e08d4f970..d9b12136cbfd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -584,6 +584,17 @@ void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked);
 
+static void destroy_sq_tracked(struct mlx5_core_dev *dev, u32 sqn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_sq_in)]   = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {};
+
+	MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ);
+	MLX5_SET(destroy_sq_in, in, sqn, sqn);
+	MLX5_SET(destroy_sq_in, in, uid, uid);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
 int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *sq)
 {
@@ -594,6 +605,7 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	if (err)
 		return err;
 
+	sq->uid = MLX5_GET(create_sq_in, in, uid);
 	sq->qpn = sqn;
 	err = create_resource_common(dev, sq, MLX5_RES_SQ);
 	if (err)
@@ -602,7 +614,7 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	return 0;
 
 err_destroy_sq:
-	mlx5_core_destroy_sq(dev, sq->qpn);
+	destroy_sq_tracked(dev, sq->qpn, sq->uid);
 
 	return err;
 }
@@ -612,7 +624,7 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
 				  struct mlx5_core_qp *sq)
 {
 	destroy_resource_common(dev, sq);
-	mlx5_core_destroy_sq(dev, sq->qpn);
+	destroy_sq_tracked(dev, sq->qpn, sq->uid);
 }
 EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 689631ca27b8..72dd1e49a799 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5381,7 +5381,7 @@ struct mlx5_ifc_modify_sq_out_bits {
 
 struct mlx5_ifc_modify_sq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6096,7 +6096,7 @@ struct mlx5_ifc_destroy_sq_out_bits {
 
 struct mlx5_ifc_destroy_sq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6769,7 +6769,7 @@ struct mlx5_ifc_create_sq_out_bits {
 
 struct mlx5_ifc_create_sq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From a0d8c054318976927493ffd26055d9d183c9beec Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:24 +0300
Subject: net/mlx5: Set uid as part of SRQ commands

Set uid as part of SRQ commands so that the firmware can manage the
SRQ object in a secured way.

That will enable using an SRQ that was created by verbs application
to be used by the DEVX flow in case the uid is equal.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/srq.c | 30 ++++++++++++++++++++++++---
 include/linux/mlx5/driver.h                   |  1 +
 include/linux/mlx5/mlx5_ifc.h                 | 22 ++++++++++----------
 include/linux/mlx5/srq.h                      |  1 +
 4 files changed, 40 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
index 23cc337a96c9..5c519615fb1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -166,6 +166,7 @@ static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 	if (!create_in)
 		return -ENOMEM;
 
+	MLX5_SET(create_srq_in, create_in, uid, in->uid);
 	srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry);
 	pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
 
@@ -178,8 +179,10 @@ static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 	err = mlx5_cmd_exec(dev, create_in, inlen, create_out,
 			    sizeof(create_out));
 	kvfree(create_in);
-	if (!err)
+	if (!err) {
 		srq->srqn = MLX5_GET(create_srq_out, create_out, srqn);
+		srq->uid = in->uid;
+	}
 
 	return err;
 }
@@ -193,6 +196,7 @@ static int destroy_srq_cmd(struct mlx5_core_dev *dev,
 	MLX5_SET(destroy_srq_in, srq_in, opcode,
 		 MLX5_CMD_OP_DESTROY_SRQ);
 	MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
+	MLX5_SET(destroy_srq_in, srq_in, uid, srq->uid);
 
 	return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in),
 			     srq_out, sizeof(srq_out));
@@ -208,6 +212,7 @@ static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 	MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ);
 	MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn);
 	MLX5_SET(arm_rq_in, srq_in, lwm,      lwm);
+	MLX5_SET(arm_rq_in, srq_in, uid, srq->uid);
 
 	return  mlx5_cmd_exec(dev, srq_in, sizeof(srq_in),
 			      srq_out, sizeof(srq_out));
@@ -260,6 +265,7 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
 	if (!create_in)
 		return -ENOMEM;
 
+	MLX5_SET(create_xrc_srq_in, create_in, uid, in->uid);
 	xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in,
 				xrc_srq_context_entry);
 	pas	 = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
@@ -277,6 +283,7 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
 		goto out;
 
 	srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn);
+	srq->uid = in->uid;
 out:
 	kvfree(create_in);
 	return err;
@@ -291,6 +298,7 @@ static int destroy_xrc_srq_cmd(struct mlx5_core_dev *dev,
 	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode,
 		 MLX5_CMD_OP_DESTROY_XRC_SRQ);
 	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, uid, srq->uid);
 
 	return mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in),
 			     xrcsrq_out, sizeof(xrcsrq_out));
@@ -306,6 +314,7 @@ static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev,
 	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod,   MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
 	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
 	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm,      lwm);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, uid, srq->uid);
 
 	return  mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in),
 			      xrcsrq_out, sizeof(xrcsrq_out));
@@ -365,10 +374,13 @@ static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 	wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
 
 	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(create_rmp_in, create_in, uid, in->uid);
 	set_wq(wq, in);
 	memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
 
 	err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn);
+	if (!err)
+		srq->uid = in->uid;
 
 	kvfree(create_in);
 	return err;
@@ -377,7 +389,13 @@ static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 static int destroy_rmp_cmd(struct mlx5_core_dev *dev,
 			   struct mlx5_core_srq *srq)
 {
-	return mlx5_core_destroy_rmp(dev, srq->srqn);
+	u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)]   = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {};
+
+	MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP);
+	MLX5_SET(destroy_rmp_in, in, rmpn, srq->srqn);
+	MLX5_SET(destroy_rmp_in, in, uid, srq->uid);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
 static int arm_rmp_cmd(struct mlx5_core_dev *dev,
@@ -400,6 +418,7 @@ static int arm_rmp_cmd(struct mlx5_core_dev *dev,
 
 	MLX5_SET(modify_rmp_in, in,	 rmp_state, MLX5_RMPC_STATE_RDY);
 	MLX5_SET(modify_rmp_in, in,	 rmpn,      srq->srqn);
+	MLX5_SET(modify_rmp_in, in, uid, srq->uid);
 	MLX5_SET(wq,		wq,	 lwm,	    lwm);
 	MLX5_SET(rmp_bitmask,	bitmask, lwm,	    1);
 	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
@@ -469,11 +488,14 @@ static int create_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 	MLX5_SET(xrqc, xrqc, user_index, in->user_index);
 	MLX5_SET(xrqc, xrqc, cqn, in->cqn);
 	MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ);
+	MLX5_SET(create_xrq_in, create_in, uid, in->uid);
 	err = mlx5_cmd_exec(dev, create_in, inlen, create_out,
 			    sizeof(create_out));
 	kvfree(create_in);
-	if (!err)
+	if (!err) {
 		srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn);
+		srq->uid = in->uid;
+	}
 
 	return err;
 }
@@ -485,6 +507,7 @@ static int destroy_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
 
 	MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ);
 	MLX5_SET(destroy_xrq_in, in, xrqn,   srq->srqn);
+	MLX5_SET(destroy_xrq_in, in, uid, srq->uid);
 
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
@@ -500,6 +523,7 @@ static int arm_xrq_cmd(struct mlx5_core_dev *dev,
 	MLX5_SET(arm_rq_in, in, op_mod,     MLX5_ARM_RQ_IN_OP_MOD_XRQ);
 	MLX5_SET(arm_rq_in, in, srq_number, srq->srqn);
 	MLX5_SET(arm_rq_in, in, lwm,	    lwm);
+	MLX5_SET(arm_rq_in, in, uid, srq->uid);
 
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d885e9f0e054..8fb072aa8671 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -474,6 +474,7 @@ struct mlx5_core_srq {
 
 	atomic_t		refcount;
 	struct completion	free;
+	u16		uid;
 };
 
 struct mlx5_eq_table {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 72dd1e49a799..85f1237d80db 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5524,7 +5524,7 @@ struct mlx5_ifc_rmp_bitmask_bits {
 
 struct mlx5_ifc_modify_rmp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5986,7 +5986,7 @@ struct mlx5_ifc_destroy_xrq_out_bits {
 
 struct mlx5_ifc_destroy_xrq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6008,7 +6008,7 @@ struct mlx5_ifc_destroy_xrc_srq_out_bits {
 
 struct mlx5_ifc_destroy_xrc_srq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6074,7 +6074,7 @@ struct mlx5_ifc_destroy_srq_out_bits {
 
 struct mlx5_ifc_destroy_srq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6208,7 +6208,7 @@ struct mlx5_ifc_destroy_rmp_out_bits {
 
 struct mlx5_ifc_destroy_rmp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6641,7 +6641,7 @@ struct mlx5_ifc_create_xrq_out_bits {
 
 struct mlx5_ifc_create_xrq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6665,7 +6665,7 @@ struct mlx5_ifc_create_xrc_srq_out_bits {
 
 struct mlx5_ifc_create_xrc_srq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6741,7 +6741,7 @@ struct mlx5_ifc_create_srq_out_bits {
 
 struct mlx5_ifc_create_srq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6871,7 +6871,7 @@ struct mlx5_ifc_create_rmp_out_bits {
 
 struct mlx5_ifc_create_rmp_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7272,7 +7272,7 @@ enum {
 
 struct mlx5_ifc_arm_xrc_srq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7300,7 +7300,7 @@ enum {
 
 struct mlx5_ifc_arm_rq_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h
index 24ff23e27c8a..1b1f3c20c6a3 100644
--- a/include/linux/mlx5/srq.h
+++ b/include/linux/mlx5/srq.h
@@ -61,6 +61,7 @@ struct mlx5_srq_attr {
 	u32 tm_next_tag;
 	u32 tm_hw_phase_cnt;
 	u32 tm_sw_phase_cnt;
+	u16 uid;
 };
 
 struct mlx5_core_dev;
-- 
cgit v1.2.3


From 774ea6eea29025218f75bc94c764df9a641db471 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:25 +0300
Subject: net/mlx5: Set uid as part of DCT commands

Set uid as part of DCT commands so that the firmware can manage the
DCT object in a secured way.

That will enable using a DCT that was created by verbs application
to be used by the DEVX flow in case the uid is equal.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 4 ++++
 include/linux/mlx5/mlx5_ifc.h                | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index d9b12136cbfd..91b8139a388d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -211,6 +211,7 @@ int mlx5_core_create_dct(struct mlx5_core_dev *dev,
 	}
 
 	qp->qpn = MLX5_GET(create_dct_out, out, dctn);
+	qp->uid = MLX5_GET(create_dct_in, in, uid);
 	err = create_resource_common(dev, qp, MLX5_RES_DCT);
 	if (err)
 		goto err_cmd;
@@ -219,6 +220,7 @@ int mlx5_core_create_dct(struct mlx5_core_dev *dev,
 err_cmd:
 	MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT);
 	MLX5_SET(destroy_dct_in, din, dctn, qp->qpn);
+	MLX5_SET(destroy_dct_in, din, uid, qp->uid);
 	mlx5_cmd_exec(dev, (void *)&in, sizeof(din),
 		      (void *)&out, sizeof(dout));
 	return err;
@@ -277,6 +279,7 @@ static int mlx5_core_drain_dct(struct mlx5_core_dev *dev,
 
 	MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT);
 	MLX5_SET(drain_dct_in, in, dctn, qp->qpn);
+	MLX5_SET(drain_dct_in, in, uid, qp->uid);
 	return mlx5_cmd_exec(dev, (void *)&in, sizeof(in),
 			     (void *)&out, sizeof(out));
 }
@@ -303,6 +306,7 @@ destroy:
 	destroy_resource_common(dev, &dct->mqp);
 	MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
 	MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
+	MLX5_SET(destroy_dct_in, in, uid, qp->uid);
 	err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in),
 			    (void *)&out, sizeof(out));
 	return err;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 85f1237d80db..62c0592a9fdb 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5918,7 +5918,7 @@ struct mlx5_ifc_drain_dct_out_bits {
 
 struct mlx5_ifc_drain_dct_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6382,7 +6382,7 @@ struct mlx5_ifc_destroy_dct_out_bits {
 
 struct mlx5_ifc_destroy_dct_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7138,7 +7138,7 @@ struct mlx5_ifc_create_dct_out_bits {
 
 struct mlx5_ifc_create_dct_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From bd37197554eb28a7fc38e44e005e303c77f788ed Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Thu, 20 Sep 2018 21:35:26 +0300
Subject: net/mlx5: Update mlx5_ifc with DEVX UID bits

Add DEVX information to WQ, SRQ, CQ, TIR, TIS, QP,
RQ, XRCD, PD, MKEY and MCG.

Each object that is created/destroyed/modified via verbs will
be stamped with a UID based on its user context. This is already
done for DEVX objects commands.

This will enable the firmware to enforce the usage of kernel objects
from the DEVX flow by validating that the same UID is used and the
resources are really related to the same user.

The addition of *_valid fields are needed to distinguish
how various addresses are passed.

For non-DEVX callers, all those fields will be zero.

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 67 +++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 62c0592a9fdb..68f4d5f9d929 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1290,7 +1290,9 @@ struct mlx5_ifc_wq_bits {
 	u8         reserved_at_118[0x3];
 	u8         log_wq_sz[0x5];
 
-	u8         reserved_at_120[0x3];
+	u8         dbr_umem_valid[0x1];
+	u8         wq_umem_valid[0x1];
+	u8         reserved_at_122[0x1];
 	u8         log_hairpin_num_packets[0x5];
 	u8         reserved_at_128[0x3];
 	u8         log_hairpin_data_sz[0x5];
@@ -2364,7 +2366,10 @@ struct mlx5_ifc_qpc_bits {
 
 	u8         dc_access_key[0x40];
 
-	u8         reserved_at_680[0xc0];
+	u8         reserved_at_680[0x3];
+	u8         dbr_umem_valid[0x1];
+
+	u8         reserved_at_684[0xbc];
 };
 
 struct mlx5_ifc_roce_addr_layout_bits {
@@ -2464,7 +2469,7 @@ struct mlx5_ifc_xrc_srqc_bits {
 
 	u8         wq_signature[0x1];
 	u8         cont_srq[0x1];
-	u8         reserved_at_22[0x1];
+	u8         dbr_umem_valid[0x1];
 	u8         rlky[0x1];
 	u8         basic_cyclic_rcv_wqe[0x1];
 	u8         log_rq_stride[0x3];
@@ -3128,7 +3133,9 @@ enum {
 
 struct mlx5_ifc_cqc_bits {
 	u8         status[0x4];
-	u8         reserved_at_4[0x4];
+	u8         reserved_at_4[0x2];
+	u8         dbr_umem_valid[0x1];
+	u8         reserved_at_7[0x1];
 	u8         cqe_sz[0x3];
 	u8         cc[0x1];
 	u8         reserved_at_c[0x1];
@@ -5314,7 +5321,7 @@ struct mlx5_ifc_modify_tis_bitmask_bits {
 
 struct mlx5_ifc_modify_tis_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5353,7 +5360,7 @@ struct mlx5_ifc_modify_tir_out_bits {
 
 struct mlx5_ifc_modify_tir_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5454,7 +5461,7 @@ struct mlx5_ifc_rqt_bitmask_bits {
 
 struct mlx5_ifc_modify_rqt_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -5641,7 +5648,10 @@ struct mlx5_ifc_modify_cq_in_bits {
 
 	struct mlx5_ifc_cqc_bits cq_context;
 
-	u8         reserved_at_280[0x600];
+	u8         reserved_at_280[0x40];
+
+	u8         cq_umem_valid[0x1];
+	u8         reserved_at_2c1[0x5bf];
 
 	u8         pas[0][0x40];
 };
@@ -5962,7 +5972,7 @@ struct mlx5_ifc_detach_from_mcg_out_bits {
 
 struct mlx5_ifc_detach_from_mcg_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6030,7 +6040,7 @@ struct mlx5_ifc_destroy_tis_out_bits {
 
 struct mlx5_ifc_destroy_tis_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6052,7 +6062,7 @@ struct mlx5_ifc_destroy_tir_out_bits {
 
 struct mlx5_ifc_destroy_tir_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6142,7 +6152,7 @@ struct mlx5_ifc_destroy_rqt_out_bits {
 
 struct mlx5_ifc_destroy_rqt_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6507,7 +6517,7 @@ struct mlx5_ifc_dealloc_xrcd_out_bits {
 
 struct mlx5_ifc_dealloc_xrcd_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6595,7 +6605,7 @@ struct mlx5_ifc_dealloc_pd_out_bits {
 
 struct mlx5_ifc_dealloc_pd_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6674,7 +6684,9 @@ struct mlx5_ifc_create_xrc_srq_in_bits {
 
 	struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry;
 
-	u8         reserved_at_280[0x600];
+	u8         reserved_at_280[0x40];
+	u8         xrc_srq_umem_valid[0x1];
+	u8         reserved_at_2c1[0x5bf];
 
 	u8         pas[0][0x40];
 };
@@ -6693,7 +6705,7 @@ struct mlx5_ifc_create_tis_out_bits {
 
 struct mlx5_ifc_create_tis_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6717,7 +6729,7 @@ struct mlx5_ifc_create_tir_out_bits {
 
 struct mlx5_ifc_create_tir_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6823,7 +6835,7 @@ struct mlx5_ifc_create_rqt_out_bits {
 
 struct mlx5_ifc_create_rqt_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -6908,7 +6920,10 @@ struct mlx5_ifc_create_qp_in_bits {
 
 	struct mlx5_ifc_qpc_bits qpc;
 
-	u8         reserved_at_800[0x80];
+	u8         reserved_at_800[0x60];
+
+	u8         wq_umem_valid[0x1];
+	u8         reserved_at_861[0x1f];
 
 	u8         pas[0][0x40];
 };
@@ -6970,7 +6985,8 @@ struct mlx5_ifc_create_mkey_in_bits {
 	u8         reserved_at_40[0x20];
 
 	u8         pg_access[0x1];
-	u8         reserved_at_61[0x1f];
+	u8         mkey_umem_valid[0x1];
+	u8         reserved_at_62[0x1e];
 
 	struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
 
@@ -7173,7 +7189,10 @@ struct mlx5_ifc_create_cq_in_bits {
 
 	struct mlx5_ifc_cqc_bits cq_context;
 
-	u8         reserved_at_280[0x600];
+	u8         reserved_at_280[0x60];
+
+	u8         cq_umem_valid[0x1];
+	u8         reserved_at_2e1[0x59f];
 
 	u8         pas[0][0x40];
 };
@@ -7221,7 +7240,7 @@ struct mlx5_ifc_attach_to_mcg_out_bits {
 
 struct mlx5_ifc_attach_to_mcg_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7348,7 +7367,7 @@ struct mlx5_ifc_alloc_xrcd_out_bits {
 
 struct mlx5_ifc_alloc_xrcd_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -7436,7 +7455,7 @@ struct mlx5_ifc_alloc_pd_out_bits {
 
 struct mlx5_ifc_alloc_pd_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From f5bd91388e26557f64ca999e0006038c7a919308 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 7 Sep 2018 10:18:46 +0200
Subject: net: xsk: add a simple buffer reuse queue

XSK UMEM is strongly single producer single consumer so reuse of
frames is challenging.  Add a simple "stash" of FILL packets to
reuse for drivers to optionally make use of.  This is useful
when driver has to free (ndo_stop) or resize a ring with an active
AF_XDP ZC socket.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/net/xdp_sock.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.c     |  2 ++
 net/xdp/xsk_queue.c    | 55 ++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h    |  3 +++
 4 files changed, 129 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 932ca0dad6f3..70a115bea4f4 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -21,6 +21,12 @@ struct xdp_umem_page {
 	dma_addr_t dma;
 };
 
+struct xdp_umem_fq_reuse {
+	u32 nentries;
+	u32 length;
+	u64 handles[];
+};
+
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
@@ -37,6 +43,7 @@ struct xdp_umem {
 	struct page **pgs;
 	u32 npgs;
 	struct net_device *dev;
+	struct xdp_umem_fq_reuse *fq_reuse;
 	u16 queue_id;
 	bool zc;
 	spinlock_t xsk_list_lock;
@@ -75,6 +82,10 @@ void xsk_umem_discard_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
 void xsk_umem_consume_tx_done(struct xdp_umem *umem);
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
+struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
+					  struct xdp_umem_fq_reuse *newq);
+void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -85,6 +96,35 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 {
 	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
 }
+
+/* Reuse-queue aware version of FILL queue helpers */
+static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	if (!rq->length)
+		return xsk_umem_peek_addr(umem, addr);
+
+	*addr = rq->handles[rq->length - 1];
+	return addr;
+}
+
+static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	if (!rq->length)
+		xsk_umem_discard_addr(umem);
+	else
+		rq->length--;
+}
+
+static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	rq->handles[rq->length++] = addr;
+}
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -128,6 +168,21 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
 {
 }
 
+static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
+{
+	return NULL;
+}
+
+static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap(
+	struct xdp_umem *umem,
+	struct xdp_umem_fq_reuse *newq)
+{
+	return NULL;
+}
+static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
+{
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
 	return NULL;
@@ -137,6 +192,20 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 {
 	return 0;
 }
+
+static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
+{
+	return NULL;
+}
+
+static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
+{
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index b3b632c5aeae..555427b3e0fe 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -165,6 +165,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		umem->cq = NULL;
 	}
 
+	xsk_reuseq_destroy(umem);
+
 	xdp_umem_unpin_pages(umem);
 
 	task = get_pid_task(umem->pid, PIDTYPE_PID);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 2dc1384d9f27..b66504592d9b 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -3,7 +3,9 @@
  * Copyright(c) 2018 Intel Corporation.
  */
 
+#include <linux/log2.h>
 #include <linux/slab.h>
+#include <linux/overflow.h>
 
 #include "xsk_queue.h"
 
@@ -62,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q)
 	page_frag_free(q->ring);
 	kfree(q);
 }
+
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
+{
+	struct xdp_umem_fq_reuse *newq;
+
+	/* Check for overflow */
+	if (nentries > (u32)roundup_pow_of_two(nentries))
+		return NULL;
+	nentries = roundup_pow_of_two(nentries);
+
+	newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
+	if (!newq)
+		return NULL;
+	memset(newq, 0, offsetof(typeof(*newq), handles));
+
+	newq->nentries = nentries;
+	return newq;
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
+
+struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
+					  struct xdp_umem_fq_reuse *newq)
+{
+	struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
+
+	if (!oldq) {
+		umem->fq_reuse = newq;
+		return NULL;
+	}
+
+	if (newq->nentries < oldq->length)
+		return newq;
+
+	memcpy(newq->handles, oldq->handles,
+	       array_size(oldq->length, sizeof(u64)));
+	newq->length = oldq->length;
+
+	umem->fq_reuse = newq;
+	return oldq;
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
+
+void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
+{
+	kvfree(rq);
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_free);
+
+void xsk_reuseq_destroy(struct xdp_umem *umem)
+{
+	xsk_reuseq_free(umem->fq_reuse);
+	umem->fq_reuse = NULL;
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 82252cccb4e0..bcb5cbb40419 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -258,4 +258,7 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q_ops);
 
+/* Executed by the core when the entire UMEM gets freed */
+void xsk_reuseq_destroy(struct xdp_umem *umem);
+
 #endif /* _LINUX_XSK_QUEUE_H */
-- 
cgit v1.2.3


From 6f99528e9797794b91b43321fbbc93fe772b0803 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 24 Sep 2018 19:22:49 +0300
Subject: net: core: netlink: add helper refcount dec and lock function

Rtnl lock is encapsulated in netlink and cannot be accessed by other
modules directly. This means that reference counted objects that rely on
rtnl lock cannot use it with refcounter helper function that atomically
releases decrements reference and obtains mutex.

This patch implements simple wrapper function around refcount_dec_and_lock
that obtains rtnl lock if reference counter value reached 0.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 2 ++
 net/core/rtnetlink.c      | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 5225832bd6ff..9cdd76348d9a 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -6,6 +6,7 @@
 #include <linux/mutex.h>
 #include <linux/netdevice.h>
 #include <linux/wait.h>
+#include <linux/refcount.h>
 #include <uapi/linux/rtnetlink.h>
 
 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
@@ -34,6 +35,7 @@ extern void rtnl_unlock(void);
 extern int rtnl_trylock(void);
 extern int rtnl_is_locked(void);
 extern int rtnl_lock_killable(void);
+extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
 
 extern wait_queue_head_t netdev_unregistering_wq;
 extern struct rw_semaphore pernet_ops_rwsem;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 80a7e18c65fb..35162e1b06ad 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -130,6 +130,12 @@ int rtnl_is_locked(void)
 }
 EXPORT_SYMBOL(rtnl_is_locked);
 
+bool refcount_dec_and_rtnl_lock(refcount_t *r)
+{
+	return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
+}
+EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);
+
 #ifdef CONFIG_PROVE_LOCKING
 bool lockdep_rtnl_is_held(void)
 {
-- 
cgit v1.2.3


From 86bd446b5cebd783187ea3772ff258210de77d99 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 24 Sep 2018 19:22:50 +0300
Subject: net: sched: rename qdisc_destroy() to qdisc_put()

Current implementation of qdisc_destroy() decrements Qdisc reference
counter and only actually destroy Qdisc if reference counter value reached
zero. Rename qdisc_destroy() to qdisc_put() in order for it to better
describe the way in which this function currently implemented and used.

Extract code that deallocates Qdisc into new private qdisc_destroy()
function. It is intended to be shared between regular qdisc_put() and its
unlocked version that is introduced in next patch in this series.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  2 +-
 net/sched/sch_api.c       |  6 +++---
 net/sched/sch_atm.c       |  2 +-
 net/sched/sch_cbq.c       |  2 +-
 net/sched/sch_cbs.c       |  2 +-
 net/sched/sch_drr.c       |  4 ++--
 net/sched/sch_dsmark.c    |  2 +-
 net/sched/sch_fifo.c      |  2 +-
 net/sched/sch_generic.c   | 23 ++++++++++++++---------
 net/sched/sch_hfsc.c      |  2 +-
 net/sched/sch_htb.c       |  4 ++--
 net/sched/sch_mq.c        |  4 ++--
 net/sched/sch_mqprio.c    |  4 ++--
 net/sched/sch_multiq.c    |  6 +++---
 net/sched/sch_netem.c     |  2 +-
 net/sched/sch_prio.c      |  6 +++---
 net/sched/sch_qfq.c       |  4 ++--
 net/sched/sch_red.c       |  4 ++--
 net/sched/sch_sfb.c       |  4 ++--
 net/sched/sch_tbf.c       |  4 ++--
 20 files changed, 47 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d326fd553b58..fadb1a4d4ee8 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -554,7 +554,7 @@ void dev_deactivate_many(struct list_head *head);
 struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 			      struct Qdisc *qdisc);
 void qdisc_reset(struct Qdisc *qdisc);
-void qdisc_destroy(struct Qdisc *qdisc);
+void qdisc_put(struct Qdisc *qdisc);
 void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
 			       unsigned int len);
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 411c40344b77..2096138c4bf6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -920,7 +920,7 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 		qdisc_notify(net, skb, n, clid, old, new);
 
 	if (old)
-		qdisc_destroy(old);
+		qdisc_put(old);
 }
 
 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
@@ -973,7 +973,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 				qdisc_refcount_inc(new);
 
 			if (!ingress)
-				qdisc_destroy(old);
+				qdisc_put(old);
 		}
 
 skip:
@@ -1561,7 +1561,7 @@ graft:
 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
 	if (err) {
 		if (q)
-			qdisc_destroy(q);
+			qdisc_put(q);
 		return err;
 	}
 
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index cd49afca9617..d714d3747bcb 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -150,7 +150,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
 	pr_debug("atm_tc_put: destroying\n");
 	list_del_init(&flow->list);
 	pr_debug("atm_tc_put: qdisc %p\n", flow->q);
-	qdisc_destroy(flow->q);
+	qdisc_put(flow->q);
 	tcf_block_put(flow->block);
 	if (flow->sock) {
 		pr_debug("atm_tc_put: f_count %ld\n",
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d53cfe..4dc05409e3fb 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1418,7 +1418,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 	WARN_ON(cl->filters);
 
 	tcf_block_put(cl->block);
-	qdisc_destroy(cl->q);
+	qdisc_put(cl->q);
 	qdisc_put_rtab(cl->R_tab);
 	gen_kill_estimator(&cl->rate_est);
 	if (cl != &q->link)
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index e26a24017faa..e689e11b6d0f 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -379,7 +379,7 @@ static void cbs_destroy(struct Qdisc *sch)
 	cbs_disable_offload(dev, q);
 
 	if (q->qdisc)
-		qdisc_destroy(q->qdisc);
+		qdisc_put(q->qdisc);
 }
 
 static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8a9939..cdebaed0f8cf 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -134,7 +134,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 					    tca[TCA_RATE]);
 		if (err) {
 			NL_SET_ERR_MSG(extack, "Failed to replace estimator");
-			qdisc_destroy(cl->qdisc);
+			qdisc_put(cl->qdisc);
 			kfree(cl);
 			return err;
 		}
@@ -153,7 +153,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
 {
 	gen_kill_estimator(&cl->rate_est);
-	qdisc_destroy(cl->qdisc);
+	qdisc_put(cl->qdisc);
 	kfree(cl);
 }
 
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 049714c57075..f6f480784bc6 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -412,7 +412,7 @@ static void dsmark_destroy(struct Qdisc *sch)
 	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
 	tcf_block_put(p->block);
-	qdisc_destroy(p->q);
+	qdisc_put(p->q);
 	if (p->mv != p->embedded)
 		kfree(p->mv);
 }
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 24893d3b5d22..3809c9bf8896 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -177,7 +177,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
 	if (q) {
 		err = fifo_set_limit(q, limit);
 		if (err < 0) {
-			qdisc_destroy(q);
+			qdisc_put(q);
 			q = NULL;
 		}
 	}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a64132a5db36..3e7696f3e053 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -901,7 +901,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
 	if (!ops->init || ops->init(sch, NULL, extack) == 0)
 		return sch;
 
-	qdisc_destroy(sch);
+	qdisc_put(sch);
 	return NULL;
 }
 EXPORT_SYMBOL(qdisc_create_dflt);
@@ -941,15 +941,11 @@ void qdisc_free(struct Qdisc *qdisc)
 	kfree((char *) qdisc - qdisc->padded);
 }
 
-void qdisc_destroy(struct Qdisc *qdisc)
+static void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops  *ops = qdisc->ops;
 	struct sk_buff *skb, *tmp;
 
-	if (qdisc->flags & TCQ_F_BUILTIN ||
-	    !refcount_dec_and_test(&qdisc->refcnt))
-		return;
-
 #ifdef CONFIG_NET_SCHED
 	qdisc_hash_del(qdisc);
 
@@ -976,7 +972,16 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	qdisc_free(qdisc);
 }
-EXPORT_SYMBOL(qdisc_destroy);
+
+void qdisc_put(struct Qdisc *qdisc)
+{
+	if (qdisc->flags & TCQ_F_BUILTIN ||
+	    !refcount_dec_and_test(&qdisc->refcnt))
+		return;
+
+	qdisc_destroy(qdisc);
+}
+EXPORT_SYMBOL(qdisc_put);
 
 /* Attach toplevel qdisc to device queue. */
 struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
@@ -1270,7 +1275,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
 		dev_queue->qdisc_sleeping = qdisc_default;
 
-		qdisc_destroy(qdisc);
+		qdisc_put(qdisc);
 	}
 }
 
@@ -1279,7 +1284,7 @@ void dev_shutdown(struct net_device *dev)
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
 	if (dev_ingress_queue(dev))
 		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
-	qdisc_destroy(dev->qdisc);
+	qdisc_put(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 
 	WARN_ON(timer_pending(&dev->watchdog_timer));
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3278a76f6861..b18ec1f6de60 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1092,7 +1092,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 	struct hfsc_sched *q = qdisc_priv(sch);
 
 	tcf_block_put(cl->block);
-	qdisc_destroy(cl->qdisc);
+	qdisc_put(cl->qdisc);
 	gen_kill_estimator(&cl->rate_est);
 	if (cl != &q->root)
 		kfree(cl);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 18ac2d6ca294..58b449490757 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1208,7 +1208,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 {
 	if (!cl->level) {
 		WARN_ON(!cl->leaf.q);
-		qdisc_destroy(cl->leaf.q);
+		qdisc_put(cl->leaf.q);
 	}
 	gen_kill_estimator(&cl->rate_est);
 	tcf_block_put(cl->block);
@@ -1409,7 +1409,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			/* turn parent into inner node */
 			qdisc_reset(parent->leaf.q);
 			qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog);
-			qdisc_destroy(parent->leaf.q);
+			qdisc_put(parent->leaf.q);
 			if (parent->prio_activity)
 				htb_deactivate(q, parent);
 
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index d6b8ae4ed7a3..f20f3a0f8424 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -65,7 +65,7 @@ static void mq_destroy(struct Qdisc *sch)
 	if (!priv->qdiscs)
 		return;
 	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
-		qdisc_destroy(priv->qdiscs[ntx]);
+		qdisc_put(priv->qdiscs[ntx]);
 	kfree(priv->qdiscs);
 }
 
@@ -119,7 +119,7 @@ static void mq_attach(struct Qdisc *sch)
 		qdisc = priv->qdiscs[ntx];
 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
 		if (old)
-			qdisc_destroy(old);
+			qdisc_put(old);
 #ifdef CONFIG_NET_SCHED
 		if (ntx < dev->real_num_tx_queues)
 			qdisc_hash_add(qdisc, false);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 0e9d761cdd80..d364e63c396d 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -40,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
 		for (ntx = 0;
 		     ntx < dev->num_tx_queues && priv->qdiscs[ntx];
 		     ntx++)
-			qdisc_destroy(priv->qdiscs[ntx]);
+			qdisc_put(priv->qdiscs[ntx]);
 		kfree(priv->qdiscs);
 	}
 
@@ -300,7 +300,7 @@ static void mqprio_attach(struct Qdisc *sch)
 		qdisc = priv->qdiscs[ntx];
 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
 		if (old)
-			qdisc_destroy(old);
+			qdisc_put(old);
 		if (ntx < dev->real_num_tx_queues)
 			qdisc_hash_add(qdisc, false);
 	}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1da7ea8de0ad..7410ce4d0321 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -175,7 +175,7 @@ multiq_destroy(struct Qdisc *sch)
 
 	tcf_block_put(q->block);
 	for (band = 0; band < q->bands; band++)
-		qdisc_destroy(q->queues[band]);
+		qdisc_put(q->queues[band]);
 
 	kfree(q->queues);
 }
@@ -204,7 +204,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
 			q->queues[i] = &noop_qdisc;
 			qdisc_tree_reduce_backlog(child, child->q.qlen,
 						  child->qstats.backlog);
-			qdisc_destroy(child);
+			qdisc_put(child);
 		}
 	}
 
@@ -228,7 +228,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
 					qdisc_tree_reduce_backlog(old,
 								  old->q.qlen,
 								  old->qstats.backlog);
-					qdisc_destroy(old);
+					qdisc_put(old);
 				}
 				sch_tree_unlock(sch);
 			}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 506e1960ed7f..57b3ad9394ad 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1022,7 +1022,7 @@ static void netem_destroy(struct Qdisc *sch)
 
 	qdisc_watchdog_cancel(&q->watchdog);
 	if (q->qdisc)
-		qdisc_destroy(q->qdisc);
+		qdisc_put(q->qdisc);
 	dist_free(q->delay_dist);
 	dist_free(q->slot_dist);
 }
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 222e53d3d27a..f8af98621179 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -175,7 +175,7 @@ prio_destroy(struct Qdisc *sch)
 	tcf_block_put(q->block);
 	prio_offload(sch, NULL);
 	for (prio = 0; prio < q->bands; prio++)
-		qdisc_destroy(q->queues[prio]);
+		qdisc_put(q->queues[prio]);
 }
 
 static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
@@ -205,7 +205,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 					      extack);
 		if (!queues[i]) {
 			while (i > oldbands)
-				qdisc_destroy(queues[--i]);
+				qdisc_put(queues[--i]);
 			return -ENOMEM;
 		}
 	}
@@ -220,7 +220,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 
 		qdisc_tree_reduce_backlog(child, child->q.qlen,
 					  child->qstats.backlog);
-		qdisc_destroy(child);
+		qdisc_put(child);
 	}
 
 	for (i = oldbands; i < q->bands; i++) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c11fc54..dc37c4ead439 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -526,7 +526,7 @@ set_change_agg:
 	return 0;
 
 destroy_class:
-	qdisc_destroy(cl->qdisc);
+	qdisc_put(cl->qdisc);
 	kfree(cl);
 	return err;
 }
@@ -537,7 +537,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 
 	qfq_rm_from_agg(q, cl);
 	gen_kill_estimator(&cl->rate_est);
-	qdisc_destroy(cl->qdisc);
+	qdisc_put(cl->qdisc);
 	kfree(cl);
 }
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 56c181c3feeb..3ce6c0a2c493 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -181,7 +181,7 @@ static void red_destroy(struct Qdisc *sch)
 
 	del_timer_sync(&q->adapt_timer);
 	red_offload(sch, false);
-	qdisc_destroy(q->qdisc);
+	qdisc_put(q->qdisc);
 }
 
 static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
@@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 	if (child) {
 		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
 					  q->qdisc->qstats.backlog);
-		qdisc_destroy(q->qdisc);
+		qdisc_put(q->qdisc);
 		q->qdisc = child;
 	}
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8419b7..bab506b01a32 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -469,7 +469,7 @@ static void sfb_destroy(struct Qdisc *sch)
 	struct sfb_sched_data *q = qdisc_priv(sch);
 
 	tcf_block_put(q->block);
-	qdisc_destroy(q->qdisc);
+	qdisc_put(q->qdisc);
 }
 
 static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
@@ -523,7 +523,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
 
 	qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
 				  q->qdisc->qstats.backlog);
-	qdisc_destroy(q->qdisc);
+	qdisc_put(q->qdisc);
 	q->qdisc = child;
 
 	q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a4530e85bd02..942dcca09cf2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -392,7 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 	if (child) {
 		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
 					  q->qdisc->qstats.backlog);
-		qdisc_destroy(q->qdisc);
+		qdisc_put(q->qdisc);
 		q->qdisc = child;
 	}
 	q->limit = qopt->limit;
@@ -438,7 +438,7 @@ static void tbf_destroy(struct Qdisc *sch)
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
 	qdisc_watchdog_cancel(&q->watchdog);
-	qdisc_destroy(q->qdisc);
+	qdisc_put(q->qdisc);
 }
 
 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
cgit v1.2.3


From 3a7d0d07a386716b459b00783b11a8211cefcc0f Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 24 Sep 2018 19:22:51 +0300
Subject: net: sched: extend Qdisc with rcu

Currently, Qdisc API functions assume that users have rtnl lock taken. To
implement rtnl unlocked classifiers update interface, Qdisc API must be
extended with functions that do not require rtnl lock.

Extend Qdisc structure with rcu. Implement special version of put function
qdisc_put_unlocked() that is called without rtnl lock taken. This function
only takes rtnl lock if Qdisc reference counter reached zero and is
intended to be used as optimization.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |  5 +++++
 include/net/pkt_sched.h   |  1 +
 include/net/sch_generic.h |  2 ++
 net/sched/sch_api.c       | 18 ++++++++++++++++++
 net/sched/sch_generic.c   | 25 ++++++++++++++++++++++++-
 5 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9cdd76348d9a..bb9cb84114c1 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -85,6 +85,11 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
 	return rtnl_dereference(dev->ingress_queue);
 }
 
+static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev)
+{
+	return rcu_dereference(dev->ingress_queue);
+}
+
 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
 
 #ifdef CONFIG_NET_INGRESS
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 7dc769e5452b..a16fbe9a2a67 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -102,6 +102,7 @@ int qdisc_set_default(const char *id);
 void qdisc_hash_add(struct Qdisc *q, bool invisible);
 void qdisc_hash_del(struct Qdisc *q);
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
+struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle);
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 					struct nlattr *tab,
 					struct netlink_ext_ack *extack);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index fadb1a4d4ee8..091b40c198ff 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -105,6 +105,7 @@ struct Qdisc {
 
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
+	struct rcu_head		rcu;
 };
 
 static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
@@ -555,6 +556,7 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 			      struct Qdisc *qdisc);
 void qdisc_reset(struct Qdisc *qdisc);
 void qdisc_put(struct Qdisc *qdisc);
+void qdisc_put_unlocked(struct Qdisc *qdisc);
 void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
 			       unsigned int len);
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 2096138c4bf6..22e9799e5b69 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -314,6 +314,24 @@ out:
 	return q;
 }
 
+struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
+{
+	struct netdev_queue *nq;
+	struct Qdisc *q;
+
+	if (!handle)
+		return NULL;
+	q = qdisc_match_from_root(dev->qdisc, handle);
+	if (q)
+		goto out;
+
+	nq = dev_ingress_queue_rcu(dev);
+	if (nq)
+		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
+out:
+	return q;
+}
+
 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 {
 	unsigned long cl;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3e7696f3e053..531fac1d2875 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -941,6 +941,13 @@ void qdisc_free(struct Qdisc *qdisc)
 	kfree((char *) qdisc - qdisc->padded);
 }
 
+void qdisc_free_cb(struct rcu_head *head)
+{
+	struct Qdisc *q = container_of(head, struct Qdisc, rcu);
+
+	qdisc_free(q);
+}
+
 static void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops  *ops = qdisc->ops;
@@ -970,7 +977,7 @@ static void qdisc_destroy(struct Qdisc *qdisc)
 		kfree_skb_list(skb);
 	}
 
-	qdisc_free(qdisc);
+	call_rcu(&qdisc->rcu, qdisc_free_cb);
 }
 
 void qdisc_put(struct Qdisc *qdisc)
@@ -983,6 +990,22 @@ void qdisc_put(struct Qdisc *qdisc)
 }
 EXPORT_SYMBOL(qdisc_put);
 
+/* Version of qdisc_put() that is called with rtnl mutex unlocked.
+ * Intended to be used as optimization, this function only takes rtnl lock if
+ * qdisc reference counter reached zero.
+ */
+
+void qdisc_put_unlocked(struct Qdisc *qdisc)
+{
+	if (qdisc->flags & TCQ_F_BUILTIN ||
+	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
+		return;
+
+	qdisc_destroy(qdisc);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(qdisc_put_unlocked);
+
 /* Attach toplevel qdisc to device queue. */
 struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 			      struct Qdisc *qdisc)
-- 
cgit v1.2.3


From 9d7e82cec35c027756ec97e274f878251f271181 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 24 Sep 2018 19:22:52 +0300
Subject: net: sched: add helper function to take reference to Qdisc

Implement function to take reference to Qdisc that relies on rcu read lock
instead of rtnl mutex. Function only takes reference to Qdisc if reference
counter isn't zero. Intended to be used by unlocked cls API.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 091b40c198ff..43b17f82d8ee 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -115,6 +115,19 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
 	refcount_inc(&qdisc->refcnt);
 }
 
+/* Intended to be used by unlocked users, when concurrent qdisc release is
+ * possible.
+ */
+
+static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
+{
+	if (qdisc->flags & TCQ_F_BUILTIN)
+		return qdisc;
+	if (refcount_inc_not_zero(&qdisc->refcnt))
+		return qdisc;
+	return NULL;
+}
+
 static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK)
-- 
cgit v1.2.3


From cfebd7e242d7193a9901222b3e667788810d98c1 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 24 Sep 2018 19:22:54 +0300
Subject: net: sched: change tcf block reference counter type to refcount_t

As a preparation for removing rtnl lock dependency from rules update path,
change tcf block reference counter type to refcount_t to allow modification
by concurrent users.

In block put function perform decrement and check reference counter once to
accommodate concurrent modification by unlocked users. After this change
tcf_chain_put at the end of block put function is called with
block->refcnt==0 and will deallocate block after the last chain is
released, so there is no need to manually deallocate block in this case.
However, if block reference counter reached 0 and there are no chains to
release, block must still be deallocated manually.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  2 +-
 net/sched/cls_api.c       | 59 ++++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 43b17f82d8ee..4a86f4d33f07 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -345,7 +345,7 @@ struct tcf_chain {
 struct tcf_block {
 	struct list_head chain_list;
 	u32 index; /* block index for shared blocks */
-	unsigned int refcnt;
+	refcount_t refcnt;
 	struct net *net;
 	struct Qdisc *q;
 	struct list_head cb_list;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index c33636e7b431..90843b6a8fa9 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -240,7 +240,7 @@ static void tcf_chain_destroy(struct tcf_chain *chain)
 	if (!chain->index)
 		block->chain0.chain = NULL;
 	kfree(chain);
-	if (list_empty(&block->chain_list) && block->refcnt == 0)
+	if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt))
 		kfree(block);
 }
 
@@ -510,7 +510,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
 	INIT_LIST_HEAD(&block->owner_list);
 	INIT_LIST_HEAD(&block->chain0.filter_chain_list);
 
-	block->refcnt = 1;
+	refcount_set(&block->refcnt, 1);
 	block->net = net;
 	block->index = block_index;
 
@@ -710,7 +710,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		/* block_index not 0 means the shared block is requested */
 		block = tcf_block_lookup(net, ei->block_index);
 		if (block)
-			block->refcnt++;
+			refcount_inc(&block->refcnt);
 	}
 
 	if (!block) {
@@ -753,7 +753,7 @@ err_block_owner_add:
 err_block_insert:
 		kfree(block);
 	} else {
-		block->refcnt--;
+		refcount_dec(&block->refcnt);
 	}
 	return err;
 }
@@ -793,34 +793,45 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 	tcf_chain0_head_change_cb_del(block, ei);
 	tcf_block_owner_del(block, q, ei->binder_type);
 
-	if (block->refcnt == 1) {
-		if (tcf_block_shared(block))
-			tcf_block_remove(block, block->net);
-
-		/* Hold a refcnt for all chains, so that they don't disappear
-		 * while we are iterating.
+	if (refcount_dec_and_test(&block->refcnt)) {
+		/* Flushing/putting all chains will cause the block to be
+		 * deallocated when last chain is freed. However, if chain_list
+		 * is empty, block has to be manually deallocated. After block
+		 * reference counter reached 0, it is no longer possible to
+		 * increment it or add new chains to block.
 		 */
-		list_for_each_entry(chain, &block->chain_list, list)
-			tcf_chain_hold(chain);
+		bool free_block = list_empty(&block->chain_list);
 
-		list_for_each_entry(chain, &block->chain_list, list)
-			tcf_chain_flush(chain);
-	}
+		if (tcf_block_shared(block))
+			tcf_block_remove(block, block->net);
 
-	tcf_block_offload_unbind(block, q, ei);
+		if (!free_block) {
+			/* Hold a refcnt for all chains, so that they don't
+			 * disappear while we are iterating.
+			 */
+			list_for_each_entry(chain, &block->chain_list, list)
+				tcf_chain_hold(chain);
 
-	if (block->refcnt == 1) {
-		/* At this point, all the chains should have refcnt >= 1. */
-		list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
-			tcf_chain_put_explicitly_created(chain);
-			tcf_chain_put(chain);
+			list_for_each_entry(chain, &block->chain_list, list)
+				tcf_chain_flush(chain);
 		}
 
-		block->refcnt--;
-		if (list_empty(&block->chain_list))
+		tcf_block_offload_unbind(block, q, ei);
+
+		if (free_block) {
 			kfree(block);
+		} else {
+			/* At this point, all the chains should have
+			 * refcnt >= 1.
+			 */
+			list_for_each_entry_safe(chain, tmp, &block->chain_list,
+						 list) {
+				tcf_chain_put_explicitly_created(chain);
+				tcf_chain_put(chain);
+			}
+		}
 	} else {
-		block->refcnt--;
+		tcf_block_offload_unbind(block, q, ei);
 	}
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
-- 
cgit v1.2.3


From 0607e439943bd150e53eed2979f9c69aa61c37ce Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 24 Sep 2018 19:22:57 +0300
Subject: net: sched: implement tcf_block_refcnt_{get|put}()

Implement get/put function for blocks that only take/release the reference
and perform deallocation. These functions are intended to be used by
unlocked rules update path to always hold reference to block while working
with it. They use on new fine-grained locking mechanisms introduced in
previous patches in this set, instead of relying on global protection
provided by rtnl lock.

Extract code that is common with tcf_block_detach_ext() into common
function __tcf_block_put().

Extend tcf_block with rcu to allow safe deallocation when it is accessed
concurrently.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  1 +
 net/sched/cls_api.c       | 74 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 51 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4a86f4d33f07..7a6b71ee5433 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -357,6 +357,7 @@ struct tcf_block {
 		struct tcf_chain *chain;
 		struct list_head filter_chain_list;
 	} chain0;
+	struct rcu_head rcu;
 };
 
 static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5a21888d0ee9..49e3c10532ad 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -241,7 +241,7 @@ static void tcf_chain_destroy(struct tcf_chain *chain)
 		block->chain0.chain = NULL;
 	kfree(chain);
 	if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt))
-		kfree(block);
+		kfree_rcu(block, rcu);
 }
 
 static void tcf_chain_hold(struct tcf_chain *chain)
@@ -537,6 +537,19 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
 	return idr_find(&tn->idr, block_index);
 }
 
+static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
+{
+	struct tcf_block *block;
+
+	rcu_read_lock();
+	block = tcf_block_lookup(net, block_index);
+	if (block && !refcount_inc_not_zero(&block->refcnt))
+		block = NULL;
+	rcu_read_unlock();
+
+	return block;
+}
+
 static void tcf_block_flush_all_chains(struct tcf_block *block)
 {
 	struct tcf_chain *chain;
@@ -562,6 +575,40 @@ static void tcf_block_put_all_chains(struct tcf_block *block)
 	}
 }
 
+static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
+			    struct tcf_block_ext_info *ei)
+{
+	if (refcount_dec_and_test(&block->refcnt)) {
+		/* Flushing/putting all chains will cause the block to be
+		 * deallocated when last chain is freed. However, if chain_list
+		 * is empty, block has to be manually deallocated. After block
+		 * reference counter reached 0, it is no longer possible to
+		 * increment it or add new chains to block.
+		 */
+		bool free_block = list_empty(&block->chain_list);
+
+		if (tcf_block_shared(block))
+			tcf_block_remove(block, block->net);
+		if (!free_block)
+			tcf_block_flush_all_chains(block);
+
+		if (q)
+			tcf_block_offload_unbind(block, q, ei);
+
+		if (free_block)
+			kfree_rcu(block, rcu);
+		else
+			tcf_block_put_all_chains(block);
+	} else if (q) {
+		tcf_block_offload_unbind(block, q, ei);
+	}
+}
+
+static void tcf_block_refcnt_put(struct tcf_block *block)
+{
+	__tcf_block_put(block, NULL, NULL);
+}
+
 /* Find tcf block.
  * Set q, parent, cl when appropriate.
  */
@@ -786,7 +833,7 @@ err_block_owner_add:
 		if (tcf_block_shared(block))
 			tcf_block_remove(block, net);
 err_block_insert:
-		kfree(block);
+		kfree_rcu(block, rcu);
 	} else {
 		refcount_dec(&block->refcnt);
 	}
@@ -826,28 +873,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 	tcf_chain0_head_change_cb_del(block, ei);
 	tcf_block_owner_del(block, q, ei->binder_type);
 
-	if (refcount_dec_and_test(&block->refcnt)) {
-		/* Flushing/putting all chains will cause the block to be
-		 * deallocated when last chain is freed. However, if chain_list
-		 * is empty, block has to be manually deallocated. After block
-		 * reference counter reached 0, it is no longer possible to
-		 * increment it or add new chains to block.
-		 */
-		bool free_block = list_empty(&block->chain_list);
-
-		if (tcf_block_shared(block))
-			tcf_block_remove(block, block->net);
-		if (!free_block)
-			tcf_block_flush_all_chains(block);
-		tcf_block_offload_unbind(block, q, ei);
-
-		if (free_block)
-			kfree(block);
-		else
-			tcf_block_put_all_chains(block);
-	} else {
-		tcf_block_offload_unbind(block, q, ei);
-	}
+	__tcf_block_put(block, q, ei);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
-- 
cgit v1.2.3


From cd11d11286cba88aab5b1da1c83ee36e5b5cefb7 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 26 Sep 2018 18:29:06 +0200
Subject: net/af_iucv: locate IUCV header via skb_network_header()

This patch attempts to untangle the TX and RX code in qeth from
af_iucv's respective HiperTransport path:
On the TX side, pointing skb_network_header() at the IUCV header
means that qeth_l3_fill_af_iucv_hdr() no longer needs a magical offset
to access the header.
On the RX side, qeth pulls the (fake) L2 header off the skb like any
normal ethernet driver would. This makes working with the IUCV header
in af_iucv easier, since we no longer have to assume a fixed skb layout.

While at it, replace the open-coded length checks in af_iucv's RX path
with pskb_may_pull().

Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_l3_main.c | 12 +++++-------
 include/net/iucv/af_iucv.h      |  5 +++++
 net/iucv/af_iucv.c              | 42 +++++++++++++----------------------------
 3 files changed, 23 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 2756795f7708..7148ef71ac78 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1348,6 +1348,7 @@ static void qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
 static int qeth_l3_process_inbound_buffer(struct qeth_card *card,
 				int budget, int *done)
 {
+	struct net_device *dev = card->dev;
 	int work_done = 0;
 	struct sk_buff *skb;
 	struct qeth_hdr *hdr;
@@ -1369,11 +1370,10 @@ static int qeth_l3_process_inbound_buffer(struct qeth_card *card,
 			magic = *(__u16 *)skb->data;
 			if ((card->info.type == QETH_CARD_TYPE_IQD) &&
 			    (magic == ETH_P_AF_IUCV)) {
-				skb->protocol = cpu_to_be16(ETH_P_AF_IUCV);
 				len = skb->len;
-				card->dev->header_ops->create(skb, card->dev, 0,
-					card->dev->dev_addr, "FAKELL", len);
-				skb_reset_mac_header(skb);
+				dev_hard_header(skb, dev, ETH_P_AF_IUCV,
+						dev->dev_addr, "FAKELL", len);
+				skb->protocol = eth_type_trans(skb, dev);
 				netif_receive_skb(skb);
 			} else {
 				qeth_l3_rebuild_skb(card, skb, hdr);
@@ -2005,17 +2005,15 @@ static void qeth_l3_fill_af_iucv_hdr(struct qeth_hdr *hdr, struct sk_buff *skb,
 				     unsigned int data_len)
 {
 	char daddr[16];
-	struct af_iucv_trans_hdr *iucv_hdr;
 
 	hdr->hdr.l3.id = QETH_HEADER_TYPE_LAYER3;
 	hdr->hdr.l3.length = data_len;
 	hdr->hdr.l3.flags = QETH_HDR_IPV6 | QETH_CAST_UNICAST;
 
-	iucv_hdr = (struct af_iucv_trans_hdr *)(skb_mac_header(skb) + ETH_HLEN);
 	memset(daddr, 0, sizeof(daddr));
 	daddr[0] = 0xfe;
 	daddr[1] = 0x80;
-	memcpy(&daddr[8], iucv_hdr->destUserID, 8);
+	memcpy(&daddr[8], iucv_trans_hdr(skb)->destUserID, 8);
 	memcpy(hdr->hdr.l3.next_hop.ipv6_addr, daddr, 16);
 }
 
diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h
index f4c21b5a1242..14a490246be9 100644
--- a/include/net/iucv/af_iucv.h
+++ b/include/net/iucv/af_iucv.h
@@ -80,6 +80,11 @@ struct af_iucv_trans_hdr {
 	u8 pad;                          /* total 104 bytes */
 } __packed;
 
+static inline struct af_iucv_trans_hdr *iucv_trans_hdr(struct sk_buff *skb)
+{
+	return (struct af_iucv_trans_hdr *)skb_network_header(skb);
+}
+
 enum iucv_tx_notify {
 	/* transmission of skb is completed and was successful */
 	TX_NOTIFY_OK = 0,
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 5b68ee908107..45115c125569 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -320,13 +320,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
 	struct sk_buff *nskb;
 	int err, confirm_recv = 0;
 
-	memset(skb->head, 0, ETH_HLEN);
-	phs_hdr = skb_push(skb, sizeof(struct af_iucv_trans_hdr));
-	skb_reset_mac_header(skb);
+	phs_hdr = skb_push(skb, sizeof(*phs_hdr));
+	memset(phs_hdr, 0, sizeof(*phs_hdr));
 	skb_reset_network_header(skb);
-	skb_push(skb, ETH_HLEN);
-	skb_reset_mac_header(skb);
-	memset(phs_hdr, 0, sizeof(struct af_iucv_trans_hdr));
 
 	phs_hdr->magic = ETH_P_AF_IUCV;
 	phs_hdr->version = 1;
@@ -350,6 +346,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
 	if (imsg)
 		memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message));
 
+	skb_push(skb, ETH_HLEN);
+	memset(skb->data, 0, ETH_HLEN);
+
 	skb->dev = iucv->hs_dev;
 	if (!skb->dev) {
 		err = -ENODEV;
@@ -1943,8 +1942,7 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16])
 /***************** HiperSockets transport callbacks ********************/
 static void afiucv_swap_src_dest(struct sk_buff *skb)
 {
-	struct af_iucv_trans_hdr *trans_hdr =
-				(struct af_iucv_trans_hdr *)skb->data;
+	struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
 	char tmpID[8];
 	char tmpName[8];
 
@@ -1967,13 +1965,12 @@ static void afiucv_swap_src_dest(struct sk_buff *skb)
  **/
 static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
 {
+	struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
 	struct sock *nsk;
 	struct iucv_sock *iucv, *niucv;
-	struct af_iucv_trans_hdr *trans_hdr;
 	int err;
 
 	iucv = iucv_sk(sk);
-	trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
 	if (!iucv) {
 		/* no sock - connection refused */
 		afiucv_swap_src_dest(skb);
@@ -2034,15 +2031,13 @@ out:
 static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb)
 {
 	struct iucv_sock *iucv = iucv_sk(sk);
-	struct af_iucv_trans_hdr *trans_hdr =
-					(struct af_iucv_trans_hdr *)skb->data;
 
 	if (!iucv)
 		goto out;
 	if (sk->sk_state != IUCV_BOUND)
 		goto out;
 	bh_lock_sock(sk);
-	iucv->msglimit_peer = trans_hdr->window;
+	iucv->msglimit_peer = iucv_trans_hdr(skb)->window;
 	sk->sk_state = IUCV_CONNECTED;
 	sk->sk_state_change(sk);
 	bh_unlock_sock(sk);
@@ -2098,8 +2093,6 @@ out:
 static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
 {
 	struct iucv_sock *iucv = iucv_sk(sk);
-	struct af_iucv_trans_hdr *trans_hdr =
-					(struct af_iucv_trans_hdr *)skb->data;
 
 	if (!iucv)
 		return NET_RX_SUCCESS;
@@ -2107,7 +2100,7 @@ static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
 	if (sk->sk_state != IUCV_CONNECTED)
 		return NET_RX_SUCCESS;
 
-	atomic_sub(trans_hdr->window, &iucv->msg_sent);
+	atomic_sub(iucv_trans_hdr(skb)->window, &iucv->msg_sent);
 	iucv_sock_wake_msglim(sk);
 	return NET_RX_SUCCESS;
 }
@@ -2170,22 +2163,13 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
 	int err = NET_RX_SUCCESS;
 	char nullstring[8];
 
-	if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) {
-		WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d",
-			  (int)skb->len,
-			  (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr)));
+	if (!pskb_may_pull(skb, sizeof(*trans_hdr))) {
+		WARN_ONCE(1, "AF_IUCV failed to receive skb, len=%u", skb->len);
 		kfree_skb(skb);
 		return NET_RX_SUCCESS;
 	}
-	if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr)))
-		if (skb_linearize(skb)) {
-			WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d",
-				  (int)skb->len);
-			kfree_skb(skb);
-			return NET_RX_SUCCESS;
-		}
-	skb_pull(skb, ETH_HLEN);
-	trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
+
+	trans_hdr = iucv_trans_hdr(skb);
 	EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
 	EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID));
 	EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName));
-- 
cgit v1.2.3


From af4325ecc24f45933d5567e72227cff2c1594764 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Mon, 24 Sep 2018 20:57:29 +0800
Subject: tcp: expose sk_state in tcp_retransmit_skb tracepoint

After sk_state exposed, we can get in which state this retransmission
occurs. That could give us more detail for dignostic.
For example, if this retransmission occurs in SYN_SENT state, it may
also indicates that the syn packet may be dropped on the remote peer due
to syn backlog queue full and then we could check the remote peer.

BTW,SYNACK retransmission is traced in tcp_retransmit_synack tracepoint.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index ac55b328d61b..2bc9960a31aa 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -56,6 +56,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 	TP_STRUCT__entry(
 		__field(const void *, skbaddr)
 		__field(const void *, skaddr)
+		__field(int, state)
 		__field(__u16, sport)
 		__field(__u16, dport)
 		__array(__u8, saddr, 4)
@@ -70,6 +71,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 
 		__entry->skbaddr = skb;
 		__entry->skaddr = sk;
+		__entry->state = sk->sk_state;
 
 		__entry->sport = ntohs(inet->inet_sport);
 		__entry->dport = ntohs(inet->inet_dport);
@@ -84,9 +86,10 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
-	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
+	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s\n",
 		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
-		  __entry->saddr_v6, __entry->daddr_v6)
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  show_tcp_state_name(__entry->state))
 );
 
 DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
-- 
cgit v1.2.3


From d888f39666774c7debfa34e4e20ba33cf61a6d71 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 25 Sep 2018 20:56:26 -0700
Subject: net-ipv4: remove 2 always zero parameters from ipv4_update_pmtu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(the parameters in question are mark and flow_flags)

Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h             | 2 +-
 net/ipv4/ah4.c                  | 2 +-
 net/ipv4/esp4.c                 | 2 +-
 net/ipv4/icmp.c                 | 2 +-
 net/ipv4/ip_gre.c               | 2 +-
 net/ipv4/ip_vti.c               | 2 +-
 net/ipv4/ipcomp.c               | 2 +-
 net/ipv4/ipip.c                 | 3 +--
 net/ipv4/route.c                | 8 +++-----
 net/ipv6/sit.c                  | 2 +-
 net/netfilter/ipvs/ip_vs_core.c | 3 +--
 net/xfrm/xfrm_interface.c       | 2 +-
 12 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index bb53cdba38dc..73c605bdd6d8 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,7 +201,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif,
-		      u32 mark, u8 protocol, int flow_flags);
+		      u8 protocol);
 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
 void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
 		   u8 protocol, int flow_flags);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 4dd95cdd8070..8811fe30282a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -461,7 +461,7 @@ static int ah4_err(struct sk_buff *skb, u32 info)
 		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
-		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 97689012b357..2d0274441923 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -820,7 +820,7 @@ static int esp4_err(struct sk_buff *skb, u32 info)
 		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
-		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 695979b7ef6d..8013b37b598f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1098,7 +1098,7 @@ void icmp_err(struct sk_buff *skb, u32 info)
 	}
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
-		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
 	else if (type == ICMP_REDIRECT)
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
 }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c3385a84f8ff..83b80fafd8f2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -239,7 +239,7 @@ static void gre_err(struct sk_buff *skb, u32 info)
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
-				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+				 skb->dev->ifindex, IPPROTO_GRE);
 		return;
 	}
 	if (type == ICMP_REDIRECT) {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index f38cb21d773d..1b5571cb3282 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -318,7 +318,7 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
-		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+		ipv4_update_pmtu(skb, net, info, 0, protocol);
 	else
 		ipv4_redirect(skb, net, 0, 0, protocol, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d97f4f2787f5..04049d1330a2 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -48,7 +48,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
 		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
-		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..6ff008e5818d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -175,8 +175,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	}
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
-				 iph->protocol, 0);
+		ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
 		goto out;
 	}
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b678466da451..7bbe3fc80b90 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1040,17 +1040,15 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
-		      int oif, u32 mark, u8 protocol, int flow_flags)
+		      int oif, u8 protocol)
 {
 	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
-
-	if (!mark)
-		mark = IP4_REPLY_MARK(net, skb->mark);
+	u32 mark = IP4_REPLY_MARK(net, skb->mark);
 
 	__build_flow_key(net, &fl4, NULL, iph, oif,
-			 RT_TOS(iph->tos), protocol, mark, flow_flags);
+			 RT_TOS(iph->tos), protocol, mark, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
 		__ip_rt_update_pmtu(rt, &fl4, mtu);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e9400ffa7875..085c588ebfe0 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -534,7 +534,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
-				 t->parms.link, 0, iph->protocol, 0);
+				 t->parms.link, iph->protocol);
 		err = 0;
 		goto out;
 	}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7ca926a03b81..fe9abf3cc10a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1686,8 +1686,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
 			skb_reset_network_header(skb);
 			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
 				&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
-			ipv4_update_pmtu(skb, ipvs->net,
-					 mtu, 0, 0, 0, 0);
+			ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
 			/* Client uses PMTUD? */
 			if (!(frag_off & htons(IP_DF)))
 				goto ignore_ipip;
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 31acc6f33d98..16bc5ecb7869 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -469,7 +469,7 @@ static int xfrmi4_err(struct sk_buff *skb, u32 info)
 	}
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
-		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+		ipv4_update_pmtu(skb, net, info, 0, protocol);
 	else
 		ipv4_redirect(skb, net, 0, 0, protocol, 0);
 	xfrm_state_put(x);
-- 
cgit v1.2.3


From 1042caa79e9351b81ed19dc8d2d7fd6ff51a4422 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 25 Sep 2018 20:56:27 -0700
Subject: net-ipv4: remove 2 always zero parameters from ipv4_redirect()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(the parameters in question are mark and flow_flags)

Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h       | 3 +--
 net/ipv4/ah4.c            | 2 +-
 net/ipv4/esp4.c           | 2 +-
 net/ipv4/icmp.c           | 2 +-
 net/ipv4/ip_gre.c         | 4 ++--
 net/ipv4/ip_vti.c         | 2 +-
 net/ipv4/ipcomp.c         | 2 +-
 net/ipv4/ipip.c           | 2 +-
 net/ipv4/route.c          | 4 ++--
 net/ipv6/sit.c            | 4 ++--
 net/xfrm/xfrm_interface.c | 2 +-
 11 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 73c605bdd6d8..9883dc82f723 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -203,8 +203,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif,
 		      u8 protocol);
 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
-void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
-		   u8 protocol, int flow_flags);
+void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol);
 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
 void ip_rt_send_redirect(struct sk_buff *skb);
 
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 8811fe30282a..c01fa791260d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -463,7 +463,7 @@ static int ah4_err(struct sk_buff *skb, u32 info)
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
 	else
-		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
+		ipv4_redirect(skb, net, 0, IPPROTO_AH);
 	xfrm_state_put(x);
 
 	return 0;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 2d0274441923..071533dd33c2 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -822,7 +822,7 @@ static int esp4_err(struct sk_buff *skb, u32 info)
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
 	else
-		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
+		ipv4_redirect(skb, net, 0, IPPROTO_ESP);
 	xfrm_state_put(x);
 
 	return 0;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 8013b37b598f..d832beed6e3a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1100,7 +1100,7 @@ void icmp_err(struct sk_buff *skb, u32 info)
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
 	else if (type == ICMP_REDIRECT)
-		ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+		ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
 }
 
 /*
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 83b80fafd8f2..38befe829caf 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -243,8 +243,8 @@ static void gre_err(struct sk_buff *skb, u32 info)
 		return;
 	}
 	if (type == ICMP_REDIRECT) {
-		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
-			      IPPROTO_GRE, 0);
+		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
+			      IPPROTO_GRE);
 		return;
 	}
 
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1b5571cb3282..de31b302d69c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -320,7 +320,7 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, protocol);
 	else
-		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+		ipv4_redirect(skb, net, 0, protocol);
 	xfrm_state_put(x);
 
 	return 0;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 04049d1330a2..9119d012ba46 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -50,7 +50,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
 	else
-		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
+		ipv4_redirect(skb, net, 0, IPPROTO_COMP);
 	xfrm_state_put(x);
 
 	return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 6ff008e5818d..e65287c27e3d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -180,7 +180,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	}
 
 	if (type == ICMP_REDIRECT) {
-		ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
+		ipv4_redirect(skb, net, t->parms.link, iph->protocol);
 		goto out;
 	}
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7bbe3fc80b90..dce2ed66ebe1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1130,14 +1130,14 @@ out:
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
 void ipv4_redirect(struct sk_buff *skb, struct net *net,
-		   int oif, u32 mark, u8 protocol, int flow_flags)
+		   int oif, u8 protocol)
 {
 	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
 
 	__build_flow_key(net, &fl4, NULL, iph, oif,
-			 RT_TOS(iph->tos), protocol, mark, flow_flags);
+			 RT_TOS(iph->tos), protocol, 0, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
 		__ip_do_redirect(rt, skb, &fl4, false);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 085c588ebfe0..51c9f75f34b9 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -539,8 +539,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
 		goto out;
 	}
 	if (type == ICMP_REDIRECT) {
-		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
-			      iph->protocol, 0);
+		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link,
+			      iph->protocol);
 		err = 0;
 		goto out;
 	}
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 16bc5ecb7869..4b4ef4f662d9 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -471,7 +471,7 @@ static int xfrmi4_err(struct sk_buff *skb, u32 info)
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, protocol);
 	else
-		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+		ipv4_redirect(skb, net, 0, protocol);
 	xfrm_state_put(x);
 
 	return 0;
-- 
cgit v1.2.3


From b950aa88638c52a013504f025e0b8f99bf2dc26e Mon Sep 17 00:00:00 2001
From: Ankit Navik <ankit.p.navik@intel.com>
Date: Fri, 17 Aug 2018 07:29:19 +0530
Subject: Bluetooth: Add definitions and track LE resolve list modification

Add the definitions for adding entries to the LE resolve list and
removing entries from the LE resolve list. When the LE resolve list
gets changed via HCI commands make sure that the internal storage of
the resolve list entries gets updated.

Signed-off-by: Ankit Navik <ankit.p.navik@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h      | 14 +++++++++
 include/net/bluetooth/hci_core.h | 15 ++++++++++
 net/bluetooth/hci_core.c         | 63 ++++++++++++++++++++++++++++++++++++++++
 net/bluetooth/hci_event.c        | 47 ++++++++++++++++++++++++++++++
 4 files changed, 139 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index cdd9f1fe7cfa..c36dc1e20556 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1517,6 +1517,20 @@ struct hci_cp_le_write_def_data_len {
 	__le16	tx_time;
 } __packed;
 
+#define HCI_OP_LE_ADD_TO_RESOLV_LIST	0x2027
+struct hci_cp_le_add_to_resolv_list {
+	__u8	 bdaddr_type;
+	bdaddr_t bdaddr;
+	__u8	 peer_irk[16];
+	__u8	 local_irk[16];
+} __packed;
+
+#define HCI_OP_LE_DEL_FROM_RESOLV_LIST	0x2028
+struct hci_cp_le_del_from_resolv_list {
+	__u8	 bdaddr_type;
+	bdaddr_t bdaddr;
+} __packed;
+
 #define HCI_OP_LE_CLEAR_RESOLV_LIST	0x2029
 
 #define HCI_OP_LE_READ_RESOLV_LIST_SIZE	0x202a
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 0db1b9b428b7..9b0f821b2d3a 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -103,6 +103,14 @@ struct bdaddr_list {
 	u8 bdaddr_type;
 };
 
+struct bdaddr_list_with_irk {
+	struct list_head list;
+	bdaddr_t bdaddr;
+	u8 bdaddr_type;
+	u8 peer_irk[16];
+	u8 local_irk[16];
+};
+
 struct bt_uuid {
 	struct list_head list;
 	u8 uuid[16];
@@ -1058,8 +1066,15 @@ int hci_inquiry(void __user *arg);
 
 struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *list,
 					   bdaddr_t *bdaddr, u8 type);
+struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
+				    struct list_head *list, bdaddr_t *bdaddr,
+				    u8 type);
 int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type);
+int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+					u8 type, u8 *peer_irk, u8 *local_irk);
 int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type);
+int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+								u8 type);
 void hci_bdaddr_list_clear(struct list_head *list);
 
 struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 74b29c7d841c..0f1a8820d75c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2839,6 +2839,20 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
 	return NULL;
 }
 
+struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
+				struct list_head *bdaddr_list, bdaddr_t *bdaddr,
+				u8 type)
+{
+	struct bdaddr_list_with_irk *b;
+
+	list_for_each_entry(b, bdaddr_list, list) {
+		if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+			return b;
+	}
+
+	return NULL;
+}
+
 void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
 {
 	struct bdaddr_list *b, *n;
@@ -2871,6 +2885,35 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type)
 	return 0;
 }
 
+int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+					u8 type, u8 *peer_irk, u8 *local_irk)
+{
+	struct bdaddr_list_with_irk *entry;
+
+	if (!bacmp(bdaddr, BDADDR_ANY))
+		return -EBADF;
+
+	if (hci_bdaddr_list_lookup(list, bdaddr, type))
+		return -EEXIST;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	bacpy(&entry->bdaddr, bdaddr);
+	entry->bdaddr_type = type;
+
+	if (peer_irk)
+		memcpy(entry->peer_irk, peer_irk, 16);
+
+	if (local_irk)
+		memcpy(entry->local_irk, local_irk, 16);
+
+	list_add(&entry->list, list);
+
+	return 0;
+}
+
 int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
 {
 	struct bdaddr_list *entry;
@@ -2890,6 +2933,26 @@ int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
 	return 0;
 }
 
+int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+							u8 type)
+{
+	struct bdaddr_list_with_irk *entry;
+
+	if (!bacmp(bdaddr, BDADDR_ANY)) {
+		hci_bdaddr_list_clear(list);
+		return 0;
+	}
+
+	entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type);
+	if (!entry)
+		return -ENOENT;
+
+	list_del(&entry->list);
+	kfree(entry);
+
+	return 0;
+}
+
 /* This function requires the caller holds hdev->lock */
 struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
 					       bdaddr_t *addr, u8 addr_type)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f12555f23a49..f47f8fad757a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1454,6 +1454,45 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
 	hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
 }
 
+static void hci_cc_le_add_to_resolv_list(struct hci_dev *hdev,
+					 struct sk_buff *skb)
+{
+	struct hci_cp_le_add_to_resolv_list *sent;
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+	if (status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST);
+	if (!sent)
+		return;
+
+	hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+				sent->bdaddr_type, sent->peer_irk,
+				sent->local_irk);
+}
+
+static void hci_cc_le_del_from_resolv_list(struct hci_dev *hdev,
+					  struct sk_buff *skb)
+{
+	struct hci_cp_le_del_from_resolv_list *sent;
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+	if (status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST);
+	if (!sent)
+		return;
+
+	hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+			    sent->bdaddr_type);
+}
+
 static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev,
 				       struct sk_buff *skb)
 {
@@ -3279,6 +3318,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
 		hci_cc_le_write_def_data_len(hdev, skb);
 		break;
 
+	case HCI_OP_LE_ADD_TO_RESOLV_LIST:
+		hci_cc_le_add_to_resolv_list(hdev, skb);
+		break;
+
+	case HCI_OP_LE_DEL_FROM_RESOLV_LIST:
+		hci_cc_le_del_from_resolv_list(hdev, skb);
+		break;
+
 	case HCI_OP_LE_CLEAR_RESOLV_LIST:
 		hci_cc_le_clear_resolv_list(hdev, skb);
 		break;
-- 
cgit v1.2.3


From fe1493101ac1313cbdbef1af65342fb17d944e71 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 4 Sep 2018 13:39:20 +0300
Subject: Bluetooth: L2CAP: Derive MPS from connection MTU

This ensures the MPS can fit in a single HCI fragment so each
segment don't have to be reassembled at HCI level, in addition to
that also remove the debugfs entry to configure the MPS.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/l2cap.h |  1 -
 net/bluetooth/l2cap_core.c    | 14 +++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 0697fd413087..17296675a0b1 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -456,7 +456,6 @@ struct l2cap_conn_param_update_rsp {
 #define L2CAP_CONN_PARAM_REJECTED	0x0001
 
 #define L2CAP_LE_MAX_CREDITS		10
-#define L2CAP_LE_DEFAULT_MPS		230
 
 struct l2cap_le_conn_req {
 	__le16     psm;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d17a4736e47c..8a60db6261f7 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -52,7 +52,6 @@ static LIST_HEAD(chan_list);
 static DEFINE_RWLOCK(chan_list_lock);
 
 static u16 le_max_credits = L2CAP_LE_MAX_CREDITS;
-static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS;
 
 static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
 				       u8 code, u8 ident, u16 dlen, void *data);
@@ -520,7 +519,8 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
 	chan->sdu_len = 0;
 	chan->tx_credits = 0;
 	chan->rx_credits = le_max_credits;
-	chan->mps = min_t(u16, chan->imtu, le_default_mps);
+	/* Derive MPS from connection MTU to stop HCI fragmentation */
+	chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
 
 	skb_queue_head_init(&chan->tx_q);
 }
@@ -1282,6 +1282,8 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
 	if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
 		return;
 
+	l2cap_le_flowctl_init(chan);
+
 	req.psm     = chan->psm;
 	req.scid    = cpu_to_le16(chan->scid);
 	req.mtu     = cpu_to_le16(chan->imtu);
@@ -5493,8 +5495,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
 		goto response_unlock;
 	}
 
-	l2cap_le_flowctl_init(chan);
-
 	bacpy(&chan->src, &conn->hcon->src);
 	bacpy(&chan->dst, &conn->hcon->dst);
 	chan->src_type = bdaddr_src_type(conn->hcon);
@@ -5506,6 +5506,9 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
 	chan->tx_credits = __le16_to_cpu(req->credits);
 
 	__l2cap_chan_add(conn, chan);
+
+	l2cap_le_flowctl_init(chan);
+
 	dcid = chan->scid;
 	credits = chan->rx_credits;
 
@@ -7102,7 +7105,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
 	case L2CAP_MODE_BASIC:
 		break;
 	case L2CAP_MODE_LE_FLOWCTL:
-		l2cap_le_flowctl_init(chan);
 		break;
 	case L2CAP_MODE_ERTM:
 	case L2CAP_MODE_STREAMING:
@@ -7647,8 +7649,6 @@ int __init l2cap_init(void)
 
 	debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs,
 			   &le_max_credits);
-	debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs,
-			   &le_default_mps);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 96cd8eaa131f0ffd4cfae09e1b4bdfafb9570907 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 4 Sep 2018 13:39:21 +0300
Subject: Bluetooth: L2CAP: Derive rx credits from MTU and MPS

Give enough rx credits for a full packet instead of using an arbitrary
number which may not be enough depending on the MTU and MPS which can
cause interruptions while waiting for more credits, also remove
debugfs entry for l2cap_le_max_credits.

With these changes the credits are restored after each SDU is received
instead of using fixed threshold, this way it is garanteed that there
will always be enough credits to send a packet without waiting more
credits to arrive.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/l2cap.h |  2 --
 net/bluetooth/l2cap_core.c    | 42 +++++++++++++++++++++++++++---------------
 2 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 17296675a0b1..3555440e14fc 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -455,8 +455,6 @@ struct l2cap_conn_param_update_rsp {
 #define L2CAP_CONN_PARAM_ACCEPTED	0x0000
 #define L2CAP_CONN_PARAM_REJECTED	0x0001
 
-#define L2CAP_LE_MAX_CREDITS		10
-
 struct l2cap_le_conn_req {
 	__le16     psm;
 	__le16     scid;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 8a60db6261f7..3fb2d757df88 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -51,8 +51,6 @@ static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
 static LIST_HEAD(chan_list);
 static DEFINE_RWLOCK(chan_list_lock);
 
-static u16 le_max_credits = L2CAP_LE_MAX_CREDITS;
-
 static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
 				       u8 code, u8 ident, u16 dlen, void *data);
 static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
@@ -518,9 +516,10 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
 	chan->sdu_last_frag = NULL;
 	chan->sdu_len = 0;
 	chan->tx_credits = 0;
-	chan->rx_credits = le_max_credits;
 	/* Derive MPS from connection MTU to stop HCI fragmentation */
 	chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
+	/* Give enough credits for a full packet */
+	chan->rx_credits = (chan->imtu / chan->mps) + 1;
 
 	skb_queue_head_init(&chan->tx_q);
 }
@@ -6702,13 +6701,10 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
 	struct l2cap_le_credits pkt;
 	u16 return_credits;
 
-	/* We return more credits to the sender only after the amount of
-	 * credits falls below half of the initial amount.
-	 */
-	if (chan->rx_credits >= (le_max_credits + 1) / 2)
-		return;
+	return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits;
 
-	return_credits = le_max_credits - chan->rx_credits;
+	if (!return_credits)
+		return;
 
 	BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
 
@@ -6722,6 +6718,21 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
 	l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
 }
 
+static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+	int err;
+
+	BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len);
+
+	/* Wait recv to confirm reception before updating the credits */
+	err = chan->ops->recv(chan, skb);
+
+	/* Update credits whenever an SDU is received */
+	l2cap_chan_le_send_credits(chan);
+
+	return err;
+}
+
 static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
 {
 	int err;
@@ -6740,7 +6751,11 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
 	chan->rx_credits--;
 	BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits);
 
-	l2cap_chan_le_send_credits(chan);
+	/* Update if remote had run out of credits, this should only happens
+	 * if the remote is not using the entire MPS.
+	 */
+	if (!chan->rx_credits)
+		l2cap_chan_le_send_credits(chan);
 
 	err = 0;
 
@@ -6766,7 +6781,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
 		}
 
 		if (skb->len == sdu_len)
-			return chan->ops->recv(chan, skb);
+			return l2cap_le_recv(chan, skb);
 
 		chan->sdu = skb;
 		chan->sdu_len = sdu_len;
@@ -6788,7 +6803,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
 	skb = NULL;
 
 	if (chan->sdu->len == chan->sdu_len) {
-		err = chan->ops->recv(chan, chan->sdu);
+		err = l2cap_le_recv(chan, chan->sdu);
 		if (!err) {
 			chan->sdu = NULL;
 			chan->sdu_last_frag = NULL;
@@ -7647,9 +7662,6 @@ int __init l2cap_init(void)
 	l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs,
 					    NULL, &l2cap_debugfs_fops);
 
-	debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs,
-			   &le_max_credits);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From fb961945457f5177072c968aa38fee910ab893b9 Mon Sep 17 00:00:00 2001
From: Christian Göttsche <cgzones@googlemail.com>
Date: Sun, 23 Sep 2018 20:26:15 +0200
Subject: netfilter: nf_tables: add SECMARK support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the ability to set the security context of packets within the nf_tables framework.
Add a nft_object for holding security contexts in the kernel and manipulating packets on the wire.

Convert the security context strings at rule addition time to security identifiers.
This is the same behavior like in xt_SECMARK and offers better performance than computing it per packet.

Set the maximum security context length to 256.

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h   |   4 ++
 include/uapi/linux/netfilter/nf_tables.h |  18 +++++-
 net/netfilter/nf_tables_core.c           |  28 ++++++--
 net/netfilter/nft_meta.c                 | 108 +++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 8da837d2aaf9..2046d104f323 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -16,6 +16,10 @@ extern struct nft_expr_type nft_meta_type;
 extern struct nft_expr_type nft_rt_type;
 extern struct nft_expr_type nft_exthdr_type;
 
+#ifdef CONFIG_NETWORK_SECMARK
+extern struct nft_object_type nft_secmark_obj_type;
+#endif
+
 int nf_tables_core_module_init(void);
 void nf_tables_core_module_exit(void);
 
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 702e4f0bec56..5444e76870bb 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1176,6 +1176,21 @@ enum nft_quota_attributes {
 };
 #define NFTA_QUOTA_MAX		(__NFTA_QUOTA_MAX - 1)
 
+/**
+ * enum nft_secmark_attributes - nf_tables secmark object netlink attributes
+ *
+ * @NFTA_SECMARK_CTX: security context (NLA_STRING)
+ */
+enum nft_secmark_attributes {
+	NFTA_SECMARK_UNSPEC,
+	NFTA_SECMARK_CTX,
+	__NFTA_SECMARK_MAX,
+};
+#define NFTA_SECMARK_MAX	(__NFTA_SECMARK_MAX - 1)
+
+/* Max security context length */
+#define NFT_SECMARK_CTX_MAXLEN		256
+
 /**
  * enum nft_reject_types - nf_tables reject expression reject types
  *
@@ -1432,7 +1447,8 @@ enum nft_ct_timeout_timeout_attributes {
 #define NFT_OBJECT_CONNLIMIT	5
 #define NFT_OBJECT_TUNNEL	6
 #define NFT_OBJECT_CT_TIMEOUT	7
-#define __NFT_OBJECT_MAX	8
+#define NFT_OBJECT_SECMARK	8
+#define __NFT_OBJECT_MAX	9
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index ffd5c0f9412b..3fbce3b9c5ec 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -249,12 +249,24 @@ static struct nft_expr_type *nft_basic_types[] = {
 	&nft_exthdr_type,
 };
 
+static struct nft_object_type *nft_basic_objects[] = {
+#ifdef CONFIG_NETWORK_SECMARK
+	&nft_secmark_obj_type,
+#endif
+};
+
 int __init nf_tables_core_module_init(void)
 {
-	int err, i;
+	int err, i, j = 0;
+
+	for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
+		err = nft_register_obj(nft_basic_objects[i]);
+		if (err)
+			goto err;
+	}
 
-	for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) {
-		err = nft_register_expr(nft_basic_types[i]);
+	for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) {
+		err = nft_register_expr(nft_basic_types[j]);
 		if (err)
 			goto err;
 	}
@@ -262,8 +274,12 @@ int __init nf_tables_core_module_init(void)
 	return 0;
 
 err:
+	while (j-- > 0)
+		nft_unregister_expr(nft_basic_types[j]);
+
 	while (i-- > 0)
-		nft_unregister_expr(nft_basic_types[i]);
+		nft_unregister_obj(nft_basic_objects[i]);
+
 	return err;
 }
 
@@ -274,4 +290,8 @@ void nf_tables_core_module_exit(void)
 	i = ARRAY_SIZE(nft_basic_types);
 	while (i-- > 0)
 		nft_unregister_expr(nft_basic_types[i]);
+
+	i = ARRAY_SIZE(nft_basic_objects);
+	while (i-- > 0)
+		nft_unregister_obj(nft_basic_objects[i]);
 }
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 297fe7d97c18..91fd6e677ad7 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -543,3 +543,111 @@ struct nft_expr_type nft_meta_type __read_mostly = {
 	.maxattr	= NFTA_META_MAX,
 	.owner		= THIS_MODULE,
 };
+
+#ifdef CONFIG_NETWORK_SECMARK
+struct nft_secmark {
+	u32 secid;
+	char *ctx;
+};
+
+static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = {
+	[NFTA_SECMARK_CTX]     = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN },
+};
+
+static int nft_secmark_compute_secid(struct nft_secmark *priv)
+{
+	u32 tmp_secid = 0;
+	int err;
+
+	err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid);
+	if (err)
+		return err;
+
+	if (!tmp_secid)
+		return -ENOENT;
+
+	err = security_secmark_relabel_packet(tmp_secid);
+	if (err)
+		return err;
+
+	priv->secid = tmp_secid;
+	return 0;
+}
+
+static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	const struct nft_secmark *priv = nft_obj_data(obj);
+	struct sk_buff *skb = pkt->skb;
+
+	skb->secmark = priv->secid;
+}
+
+static int nft_secmark_obj_init(const struct nft_ctx *ctx,
+				const struct nlattr * const tb[],
+				struct nft_object *obj)
+{
+	struct nft_secmark *priv = nft_obj_data(obj);
+	int err;
+
+	if (tb[NFTA_SECMARK_CTX] == NULL)
+		return -EINVAL;
+
+	priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
+	if (!priv->ctx)
+		return -ENOMEM;
+
+	err = nft_secmark_compute_secid(priv);
+	if (err) {
+		kfree(priv->ctx);
+		return err;
+	}
+
+	security_secmark_refcount_inc();
+
+	return 0;
+}
+
+static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj,
+				bool reset)
+{
+	struct nft_secmark *priv = nft_obj_data(obj);
+	int err;
+
+	if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx))
+		return -1;
+
+	if (reset) {
+		err = nft_secmark_compute_secid(priv);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
+{
+	struct nft_secmark *priv = nft_obj_data(obj);
+
+	security_secmark_refcount_dec();
+
+	kfree(priv->ctx);
+}
+
+static const struct nft_object_ops nft_secmark_obj_ops = {
+	.type		= &nft_secmark_obj_type,
+	.size		= sizeof(struct nft_secmark),
+	.init		= nft_secmark_obj_init,
+	.eval		= nft_secmark_obj_eval,
+	.dump		= nft_secmark_obj_dump,
+	.destroy	= nft_secmark_obj_destroy,
+};
+struct nft_object_type nft_secmark_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_SECMARK,
+	.ops		= &nft_secmark_obj_ops,
+	.maxattr	= NFTA_SECMARK_MAX,
+	.policy		= nft_secmark_policy,
+	.owner		= THIS_MODULE,
+};
+#endif /* CONFIG_NETWORK_SECMARK */
-- 
cgit v1.2.3


From fe3b30ddb90face841b2ede3b73ed2e9cfece6ba Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 26 Sep 2018 11:15:30 +0200
Subject: netlink: remove NLA_NESTED_COMPAT

This isn't used anywhere, so we might as well get rid of it.

Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h |  2 --
 lib/nlattr.c          | 11 -----------
 2 files changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 318b1ded3833..b680fe365e91 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -172,7 +172,6 @@ enum {
 	NLA_FLAG,
 	NLA_MSECS,
 	NLA_NESTED,
-	NLA_NESTED_COMPAT,
 	NLA_NUL_STRING,
 	NLA_BINARY,
 	NLA_S8,
@@ -203,7 +202,6 @@ enum {
  *    NLA_BINARY           Maximum length of attribute payload
  *    NLA_NESTED           Don't use `len' field -- length verification is
  *                         done by checking len of nested header (or empty)
- *    NLA_NESTED_COMPAT    Minimum length of structure payload
  *    NLA_U8, NLA_U16,
  *    NLA_U32, NLA_U64,
  *    NLA_S8, NLA_S16,
diff --git a/lib/nlattr.c b/lib/nlattr.c
index bb6fe5ed4ecf..120ad569e13d 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -140,17 +140,6 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 			return -ERANGE;
 		break;
 
-	case NLA_NESTED_COMPAT:
-		if (attrlen < pt->len)
-			return -ERANGE;
-		if (attrlen < NLA_ALIGN(pt->len))
-			break;
-		if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN)
-			return -ERANGE;
-		nla = nla_data(nla) + NLA_ALIGN(pt->len);
-		if (attrlen < NLA_ALIGN(pt->len) + NLA_HDRLEN + nla_len(nla))
-			return -ERANGE;
-		break;
 	case NLA_NESTED:
 		/* a nested attributes is allowed to be empty; if its not,
 		 * it must have a size of at least NLA_HDRLEN.
-- 
cgit v1.2.3


From 48fde90a78f8c67e2bec5061f9725fe363519feb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 26 Sep 2018 11:15:31 +0200
Subject: netlink: make validation_data const

The validation data is only used within the policy that
should usually already be const, and isn't changed in any
code that uses it. Therefore, make the validation_data
pointer const.

While at it, remove the duplicate variable in the bitfield
validation that I'd otherwise have to change to const.

Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 2 +-
 lib/nlattr.c          | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index b680fe365e91..0d698215d4d9 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -237,7 +237,7 @@ enum {
 struct nla_policy {
 	u16		type;
 	u16		len;
-	void            *validation_data;
+	const void     *validation_data;
 };
 
 #define NLA_POLICY_EXACT_LEN(_len)	{ .type = NLA_EXACT_LEN, .len = _len }
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 120ad569e13d..e2e5b38394d5 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -45,12 +45,11 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
 };
 
 static int validate_nla_bitfield32(const struct nlattr *nla,
-				   u32 *valid_flags_allowed)
+				   const u32 *valid_flags_mask)
 {
 	const struct nla_bitfield32 *bf = nla_data(nla);
-	u32 *valid_flags_mask = valid_flags_allowed;
 
-	if (!valid_flags_allowed)
+	if (!valid_flags_mask)
 		return -EINVAL;
 
 	/*disallow invalid bit selector */
-- 
cgit v1.2.3


From 9a659a35ba177cec30676e170fb6ed98157bcb0d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 26 Sep 2018 11:15:33 +0200
Subject: netlink: allow NLA_NESTED to specify nested policy to validate

Now that we have a validation_data pointer, and the len field in
the policy is unused for NLA_NESTED, we can allow using them both
to have nested validation. This can be nice in code, although we
still have to use nla_parse_nested() or similar which would also
take a policy; however, it also serves as documentation in the
policy without requiring a look at the code.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 13 +++++++++++--
 lib/nlattr.c          | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0d698215d4d9..91907852da1c 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -200,8 +200,10 @@ enum {
  *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
  *    NLA_FLAG             Unused
  *    NLA_BINARY           Maximum length of attribute payload
- *    NLA_NESTED           Don't use `len' field -- length verification is
- *                         done by checking len of nested header (or empty)
+ *    NLA_NESTED           Length verification is done by checking len of
+ *                         nested header (or empty); len field is used if
+ *                         validation_data is also used, for the max attr
+ *                         number in the nested policy.
  *    NLA_U8, NLA_U16,
  *    NLA_U32, NLA_U64,
  *    NLA_S8, NLA_S16,
@@ -224,6 +226,10 @@ enum {
  *    NLA_REJECT           This attribute is always rejected and validation data
  *                         may point to a string to report as the error instead
  *                         of the generic one in extended ACK.
+ *    NLA_NESTED           Points to a nested policy to validate, must also set
+ *                         `len' to the max attribute number.
+ *                         Note that nla_parse() will validate, but of course not
+ *                         parse, the nested sub-policies.
  *    All other            Unused
  *
  * Example:
@@ -247,6 +253,9 @@ struct nla_policy {
 #define NLA_POLICY_ETH_ADDR		NLA_POLICY_EXACT_LEN(ETH_ALEN)
 #define NLA_POLICY_ETH_ADDR_COMPAT	NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
 
+#define NLA_POLICY_NESTED(maxattr, policy) \
+	{ .type = NLA_NESTED, .validation_data = policy, .len = maxattr }
+
 /**
  * struct nl_info - netlink source information
  * @nlh: Netlink message header of original request
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 6e03d650bec4..04750f88477c 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -155,6 +155,20 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 		 */
 		if (attrlen == 0)
 			break;
+		if (attrlen < NLA_HDRLEN)
+			goto out_err;
+		if (pt->validation_data) {
+			err = nla_validate(nla_data(nla), nla_len(nla), pt->len,
+					   pt->validation_data, extack);
+			if (err < 0) {
+				/*
+				 * return directly to preserve the inner
+				 * error message/attribute pointer
+				 */
+				return err;
+			}
+		}
+		break;
 	default:
 		if (pt->len)
 			minlen = pt->len;
-- 
cgit v1.2.3


From 1501d13596b92d6d1f0ea5e104be838188b6e026 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 26 Sep 2018 11:15:34 +0200
Subject: netlink: add nested array policy validation

Sometimes nested netlink attributes are just used as arrays, with
the nla_type() of each not being used; we have this in nl80211 and
e.g. NFTA_SET_ELEM_LIST_ELEMENTS.

Add the ability to validate this type of message directly in the
policy, by adding the type NLA_NESTED_ARRAY which does exactly
this: require a first level of nesting but ignore the attribute
type, and then inside each require a second level of nested and
validate those attributes against a given policy (if present).

Note that some nested array types actually require that all of
the entries have the same index, this is possible to express in
a nested policy already, apart from the validation that only the
one allowed type is used.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 12 +++++++++++-
 lib/nlattr.c          | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 91907852da1c..3698ca8ff92c 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -172,6 +172,7 @@ enum {
 	NLA_FLAG,
 	NLA_MSECS,
 	NLA_NESTED,
+	NLA_NESTED_ARRAY,
 	NLA_NUL_STRING,
 	NLA_BINARY,
 	NLA_S8,
@@ -200,7 +201,8 @@ enum {
  *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
  *    NLA_FLAG             Unused
  *    NLA_BINARY           Maximum length of attribute payload
- *    NLA_NESTED           Length verification is done by checking len of
+ *    NLA_NESTED,
+ *    NLA_NESTED_ARRAY     Length verification is done by checking len of
  *                         nested header (or empty); len field is used if
  *                         validation_data is also used, for the max attr
  *                         number in the nested policy.
@@ -230,6 +232,12 @@ enum {
  *                         `len' to the max attribute number.
  *                         Note that nla_parse() will validate, but of course not
  *                         parse, the nested sub-policies.
+ *    NLA_NESTED_ARRAY     Points to a nested policy to validate, must also set
+ *                         `len' to the max attribute number. The difference to
+ *                         NLA_NESTED is the structure - NLA_NESTED has the
+ *                         nested attributes directly inside, while an array has
+ *                         the nested attributes at another level down and the
+ *                         attributes directly in the nesting don't matter.
  *    All other            Unused
  *
  * Example:
@@ -255,6 +263,8 @@ struct nla_policy {
 
 #define NLA_POLICY_NESTED(maxattr, policy) \
 	{ .type = NLA_NESTED, .validation_data = policy, .len = maxattr }
+#define NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
+	{ .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr }
 
 /**
  * struct nl_info - netlink source information
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 04750f88477c..2f8feff669a7 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -67,6 +67,34 @@ static int validate_nla_bitfield32(const struct nlattr *nla,
 	return 0;
 }
 
+static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
+			      const struct nla_policy *policy,
+			      struct netlink_ext_ack *extack)
+{
+	const struct nlattr *entry;
+	int rem;
+
+	nla_for_each_attr(entry, head, len, rem) {
+		int ret;
+
+		if (nla_len(entry) == 0)
+			continue;
+
+		if (nla_len(entry) < NLA_HDRLEN) {
+			NL_SET_ERR_MSG_ATTR(extack, entry,
+					    "Array element too short");
+			return -ERANGE;
+		}
+
+		ret = nla_validate(nla_data(entry), nla_len(entry),
+				   maxtype, policy, extack);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 static int validate_nla(const struct nlattr *nla, int maxtype,
 			const struct nla_policy *policy,
 			struct netlink_ext_ack *extack)
@@ -169,6 +197,29 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 			}
 		}
 		break;
+	case NLA_NESTED_ARRAY:
+		/* a nested array attribute is allowed to be empty; if its not,
+		 * it must have a size of at least NLA_HDRLEN.
+		 */
+		if (attrlen == 0)
+			break;
+		if (attrlen < NLA_HDRLEN)
+			goto out_err;
+		if (pt->validation_data) {
+			int err;
+
+			err = nla_validate_array(nla_data(nla), nla_len(nla),
+						 pt->len, pt->validation_data,
+						 extack);
+			if (err < 0) {
+				/*
+				 * return directly to preserve the inner
+				 * error message/attribute pointer
+				 */
+				return err;
+			}
+		}
+		break;
 	default:
 		if (pt->len)
 			minlen = pt->len;
-- 
cgit v1.2.3


From 30d65e0804d58a03d1a8ea4e12c6fc07ed08218b Mon Sep 17 00:00:00 2001
From: Matias Karhumaa <matias.karhumaa@gmail.com>
Date: Fri, 28 Sep 2018 21:54:30 +0300
Subject: Bluetooth: Fix debugfs NULL pointer dereference

Fix crash caused by NULL pointer dereference when debugfs functions
le_max_key_read, le_max_key_size_write, le_min_key_size_read or
le_min_key_size_write and Bluetooth adapter was powered off.

Fix is to move max_key_size and min_key_size from smp_dev to hci_dev.
At the same time they were renamed to le_max_key_size and
le_min_key_size.

BUG: unable to handle kernel NULL pointer dereference at 00000000000002e8
PGD 0 P4D 0
Oops: 0000 [#24] SMP PTI
CPU: 2 PID: 6255 Comm: cat Tainted: G      D    OE     4.18.9-200.fc28.x86_64 #1
Hardware name: LENOVO 4286CTO/4286CTO, BIOS 8DET76WW (1.46 ) 06/21/2018
RIP: 0010:le_max_key_size_read+0x45/0xb0 [bluetooth]
Code: 00 00 00 48 83 ec 10 65 48 8b 04 25 28 00 00 00 48 89 44 24 08 31 c0 48 8b 87 c8 00 00 00 48 8d 7c 24 04 48 8b 80 48 0a 00 00 <48> 8b 80 e8 02 00 00 0f b6 48 52 e8 fb b6 b3 ed be 04 00 00 00 48
RSP: 0018:ffffab23c3ff3df0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00007f0b4ca2e000 RCX: ffffab23c3ff3f08
RDX: ffffffffc0ddb033 RSI: 0000000000000004 RDI: ffffab23c3ff3df4
RBP: 0000000000020000 R08: 0000000000000000 R09: 0000000000000000
R10: ffffab23c3ff3ed8 R11: 0000000000000000 R12: ffffab23c3ff3f08
R13: 00007f0b4ca2e000 R14: 0000000000020000 R15: ffffab23c3ff3f08
FS:  00007f0b4ca0f540(0000) GS:ffff91bd5e280000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000002e8 CR3: 00000000629fa006 CR4: 00000000000606e0
Call Trace:
 full_proxy_read+0x53/0x80
 __vfs_read+0x36/0x180
 vfs_read+0x8a/0x140
 ksys_read+0x4f/0xb0
 do_syscall_64+0x5b/0x160
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Matias Karhumaa <matias.karhumaa@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  2 ++
 net/bluetooth/hci_core.c         |  2 ++
 net/bluetooth/smp.c              | 23 +++++++++--------------
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 9b0f821b2d3a..e5ea633ea368 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -267,6 +267,8 @@ struct hci_dev {
 	__u16		le_max_tx_time;
 	__u16		le_max_rx_len;
 	__u16		le_max_rx_time;
+	__u8		le_max_key_size;
+	__u8		le_min_key_size;
 	__u16		discov_interleaved_timeout;
 	__u16		conn_info_min_age;
 	__u16		conn_info_max_age;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 0f1a8820d75c..7352fe85674b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3147,6 +3147,8 @@ struct hci_dev *hci_alloc_dev(void)
 	hdev->le_max_tx_time = 0x0148;
 	hdev->le_max_rx_len = 0x001b;
 	hdev->le_max_rx_time = 0x0148;
+	hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE;
+	hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE;
 	hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
 	hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
 
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 3a7b0773536b..090670fe385f 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -88,9 +88,6 @@ struct smp_dev {
 	u8			local_rand[16];
 	bool			debug_key;
 
-	u8			min_key_size;
-	u8			max_key_size;
-
 	struct crypto_cipher	*tfm_aes;
 	struct crypto_shash	*tfm_cmac;
 	struct crypto_kpp	*tfm_ecdh;
@@ -720,7 +717,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
 	if (rsp == NULL) {
 		req->io_capability = conn->hcon->io_capability;
 		req->oob_flag = oob_flag;
-		req->max_key_size = SMP_DEV(hdev)->max_key_size;
+		req->max_key_size = hdev->le_max_key_size;
 		req->init_key_dist = local_dist;
 		req->resp_key_dist = remote_dist;
 		req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -731,7 +728,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
 
 	rsp->io_capability = conn->hcon->io_capability;
 	rsp->oob_flag = oob_flag;
-	rsp->max_key_size = SMP_DEV(hdev)->max_key_size;
+	rsp->max_key_size = hdev->le_max_key_size;
 	rsp->init_key_dist = req->init_key_dist & remote_dist;
 	rsp->resp_key_dist = req->resp_key_dist & local_dist;
 	rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -745,7 +742,7 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
 	struct hci_dev *hdev = conn->hcon->hdev;
 	struct smp_chan *smp = chan->data;
 
-	if (max_key_size > SMP_DEV(hdev)->max_key_size ||
+	if (max_key_size > hdev->le_max_key_size ||
 	    max_key_size < SMP_MIN_ENC_KEY_SIZE)
 		return SMP_ENC_KEY_SIZE;
 
@@ -3243,8 +3240,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
 	smp->tfm_aes = tfm_aes;
 	smp->tfm_cmac = tfm_cmac;
 	smp->tfm_ecdh = tfm_ecdh;
-	smp->min_key_size = SMP_MIN_ENC_KEY_SIZE;
-	smp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
 
 create_chan:
 	chan = l2cap_chan_create();
@@ -3370,7 +3365,7 @@ static ssize_t le_min_key_size_read(struct file *file,
 	struct hci_dev *hdev = file->private_data;
 	char buf[4];
 
-	snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size);
+	snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size);
 
 	return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
 }
@@ -3391,11 +3386,11 @@ static ssize_t le_min_key_size_write(struct file *file,
 
 	sscanf(buf, "%hhu", &key_size);
 
-	if (key_size > SMP_DEV(hdev)->max_key_size ||
+	if (key_size > hdev->le_max_key_size ||
 	    key_size < SMP_MIN_ENC_KEY_SIZE)
 		return -EINVAL;
 
-	SMP_DEV(hdev)->min_key_size = key_size;
+	hdev->le_min_key_size = key_size;
 
 	return count;
 }
@@ -3414,7 +3409,7 @@ static ssize_t le_max_key_size_read(struct file *file,
 	struct hci_dev *hdev = file->private_data;
 	char buf[4];
 
-	snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size);
+	snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size);
 
 	return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
 }
@@ -3436,10 +3431,10 @@ static ssize_t le_max_key_size_write(struct file *file,
 	sscanf(buf, "%hhu", &key_size);
 
 	if (key_size > SMP_MAX_ENC_KEY_SIZE ||
-	    key_size < SMP_DEV(hdev)->min_key_size)
+	    key_size < hdev->le_min_key_size)
 		return -EINVAL;
 
-	SMP_DEV(hdev)->max_key_size = key_size;
+	hdev->le_max_key_size = key_size;
 
 	return count;
 }
-- 
cgit v1.2.3


From 80ece6a03aaf3f3215475826bdd2bb9f326bccfd Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Wed, 26 Sep 2018 16:22:08 +0530
Subject: tls: Remove redundant vars from tls record structure

Structure 'tls_rec' contains sg_aead_in and sg_aead_out which point
to a aad_space and then chain scatterlists sg_plaintext_data,
sg_encrypted_data respectively. Rather than using chained scatterlists
for plaintext and encrypted data in aead_req, it is efficient to store
aad_space in sg_encrypted_data and sg_plaintext_data itself in the
first index and get rid of sg_aead_in, sg_aead_in and further chaining.

This requires increasing size of sg_encrypted_data & sg_plaintext_data
arrarys by 1 to accommodate entry for aad_space. The code which uses
sg_encrypted_data and sg_plaintext_data has been modified to skip first
index as it points to aad_space.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h |  6 ++--
 net/tls/tls_sw.c  | 92 ++++++++++++++++++++++++++-----------------------------
 2 files changed, 45 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 1615fb5ea114..262420cdad10 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -101,13 +101,11 @@ struct tls_rec {
 	struct list_head list;
 	int tx_ready;
 	int tx_flags;
-	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
-	struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS];
 
 	/* AAD | sg_plaintext_data | sg_tag */
-	struct scatterlist sg_aead_in[2];
+	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1];
 	/* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
-	struct scatterlist sg_aead_out[2];
+	struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1];
 
 	unsigned int sg_plaintext_size;
 	unsigned int sg_encrypted_size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 2176678a5b69..1d4c354d5516 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -248,7 +248,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
 
-	trim_sg(sk, rec->sg_plaintext_data,
+	trim_sg(sk, &rec->sg_plaintext_data[1],
 		&rec->sg_plaintext_num_elem,
 		&rec->sg_plaintext_size,
 		target_size);
@@ -256,7 +256,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
 	if (target_size > 0)
 		target_size += tls_ctx->tx.overhead_size;
 
-	trim_sg(sk, rec->sg_encrypted_data,
+	trim_sg(sk, &rec->sg_encrypted_data[1],
 		&rec->sg_encrypted_num_elem,
 		&rec->sg_encrypted_size,
 		target_size);
@@ -270,12 +270,13 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
 	int rc = 0;
 
 	rc = sk_alloc_sg(sk, len,
-			 rec->sg_encrypted_data, 0,
+			 &rec->sg_encrypted_data[1], 0,
 			 &rec->sg_encrypted_num_elem,
 			 &rec->sg_encrypted_size, 0);
 
 	if (rc == -ENOSPC)
-		rec->sg_encrypted_num_elem = ARRAY_SIZE(rec->sg_encrypted_data);
+		rec->sg_encrypted_num_elem =
+			ARRAY_SIZE(rec->sg_encrypted_data) - 1;
 
 	return rc;
 }
@@ -287,12 +288,15 @@ static int alloc_plaintext_sg(struct sock *sk, int len)
 	struct tls_rec *rec = ctx->open_rec;
 	int rc = 0;
 
-	rc = sk_alloc_sg(sk, len, rec->sg_plaintext_data, 0,
-			 &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size,
+	rc = sk_alloc_sg(sk, len,
+			 &rec->sg_plaintext_data[1], 0,
+			 &rec->sg_plaintext_num_elem,
+			 &rec->sg_plaintext_size,
 			 tls_ctx->pending_open_record_frags);
 
 	if (rc == -ENOSPC)
-		rec->sg_plaintext_num_elem = ARRAY_SIZE(rec->sg_plaintext_data);
+		rec->sg_plaintext_num_elem =
+			ARRAY_SIZE(rec->sg_plaintext_data) - 1;
 
 	return rc;
 }
@@ -320,11 +324,11 @@ static void tls_free_open_rec(struct sock *sk)
 	if (!rec)
 		return;
 
-	free_sg(sk, rec->sg_encrypted_data,
+	free_sg(sk, &rec->sg_encrypted_data[1],
 		&rec->sg_encrypted_num_elem,
 		&rec->sg_encrypted_size);
 
-	free_sg(sk, rec->sg_plaintext_data,
+	free_sg(sk, &rec->sg_plaintext_data[1],
 		&rec->sg_plaintext_num_elem,
 		&rec->sg_plaintext_size);
 
@@ -355,7 +359,7 @@ int tls_tx_records(struct sock *sk, int flags)
 		 * Remove the head of tx_list
 		 */
 		list_del(&rec->list);
-		free_sg(sk, rec->sg_plaintext_data,
+		free_sg(sk, &rec->sg_plaintext_data[1],
 			&rec->sg_plaintext_num_elem, &rec->sg_plaintext_size);
 
 		kfree(rec);
@@ -370,13 +374,13 @@ int tls_tx_records(struct sock *sk, int flags)
 				tx_flags = flags;
 
 			rc = tls_push_sg(sk, tls_ctx,
-					 &rec->sg_encrypted_data[0],
+					 &rec->sg_encrypted_data[1],
 					 0, tx_flags);
 			if (rc)
 				goto tx_err;
 
 			list_del(&rec->list);
-			free_sg(sk, rec->sg_plaintext_data,
+			free_sg(sk, &rec->sg_plaintext_data[1],
 				&rec->sg_plaintext_num_elem,
 				&rec->sg_plaintext_size);
 
@@ -405,16 +409,12 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 
 	rec = container_of(aead_req, struct tls_rec, aead_req);
 
-	rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
-	rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
+	rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size;
+	rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size;
 
 
-	/* Free the record if error is previously set on socket */
+	/* Check if error is previously set on socket */
 	if (err || sk->sk_err) {
-		free_sg(sk, rec->sg_encrypted_data,
-			&rec->sg_encrypted_num_elem, &rec->sg_encrypted_size);
-
-		kfree(rec);
 		rec = NULL;
 
 		/* If err is already set on socket, return the same code */
@@ -449,7 +449,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 
 	/* Schedule the transmission */
 	if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
-		schedule_delayed_work(&ctx->tx_work.work, 1);
+		schedule_delayed_work(&ctx->tx_work.work, 2);
 }
 
 static int tls_do_encryption(struct sock *sk,
@@ -461,13 +461,14 @@ static int tls_do_encryption(struct sock *sk,
 	struct tls_rec *rec = ctx->open_rec;
 	int rc;
 
-	rec->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
-	rec->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
+	/* Skip the first index as it contains AAD data */
+	rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size;
+	rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size;
 
 	aead_request_set_tfm(aead_req, ctx->aead_send);
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
-	aead_request_set_crypt(aead_req, rec->sg_aead_in,
-			       rec->sg_aead_out,
+	aead_request_set_crypt(aead_req, rec->sg_plaintext_data,
+			       rec->sg_encrypted_data,
 			       data_len, tls_ctx->tx.iv);
 
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
@@ -480,8 +481,8 @@ static int tls_do_encryption(struct sock *sk,
 	rc = crypto_aead_encrypt(aead_req);
 	if (!rc || rc != -EINPROGRESS) {
 		atomic_dec(&ctx->encrypt_pending);
-		rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
-		rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
+		rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size;
+		rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size;
 	}
 
 	if (!rc) {
@@ -512,16 +513,16 @@ static int tls_push_record(struct sock *sk, int flags,
 	rec->tx_flags = flags;
 	req = &rec->aead_req;
 
-	sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem - 1);
-	sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem - 1);
+	sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem);
+	sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem);
 
 	tls_make_aad(rec->aad_space, rec->sg_plaintext_size,
 		     tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
 		     record_type);
 
 	tls_fill_prepend(tls_ctx,
-			 page_address(sg_page(&rec->sg_encrypted_data[0])) +
-			 rec->sg_encrypted_data[0].offset,
+			 page_address(sg_page(&rec->sg_encrypted_data[1])) +
+			 rec->sg_encrypted_data[1].offset,
 			 rec->sg_plaintext_size, record_type);
 
 	tls_ctx->pending_open_record_frags = 0;
@@ -613,7 +614,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
-	struct scatterlist *sg = rec->sg_plaintext_data;
+	struct scatterlist *sg = &rec->sg_plaintext_data[1];
 	int copy, i, rc = 0;
 
 	for (i = tls_ctx->pending_open_record_frags;
@@ -659,17 +660,10 @@ static struct tls_rec *get_rec(struct sock *sk)
 	sg_init_table(&rec->sg_encrypted_data[0],
 		      ARRAY_SIZE(rec->sg_encrypted_data));
 
-	sg_init_table(rec->sg_aead_in, 2);
-	sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
+	sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space,
 		   sizeof(rec->aad_space));
-	sg_unmark_end(&rec->sg_aead_in[1]);
-	sg_chain(rec->sg_aead_in, 2, rec->sg_plaintext_data);
-
-	sg_init_table(rec->sg_aead_out, 2);
-	sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
+	sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space,
 		   sizeof(rec->aad_space));
-	sg_unmark_end(&rec->sg_aead_out[1]);
-	sg_chain(rec->sg_aead_out, 2, rec->sg_encrypted_data);
 
 	ctx->open_rec = rec;
 
@@ -763,8 +757,8 @@ alloc_encrypted:
 			ret = zerocopy_from_iter(sk, &msg->msg_iter,
 				try_to_copy, &rec->sg_plaintext_num_elem,
 				&rec->sg_plaintext_size,
-				rec->sg_plaintext_data,
-				ARRAY_SIZE(rec->sg_plaintext_data),
+				&rec->sg_plaintext_data[1],
+				ARRAY_SIZE(rec->sg_plaintext_data) - 1,
 				true);
 			if (ret)
 				goto fallback_to_reg_send;
@@ -781,7 +775,7 @@ alloc_encrypted:
 			continue;
 
 fallback_to_reg_send:
-			trim_sg(sk, rec->sg_plaintext_data,
+			trim_sg(sk, &rec->sg_plaintext_data[1],
 				&rec->sg_plaintext_num_elem,
 				&rec->sg_plaintext_size,
 				orig_size);
@@ -801,7 +795,7 @@ alloc_plaintext:
 			try_to_copy -= required_size - rec->sg_plaintext_size;
 			full_record = true;
 
-			trim_sg(sk, rec->sg_encrypted_data,
+			trim_sg(sk, &rec->sg_encrypted_data[1],
 				&rec->sg_encrypted_num_elem,
 				&rec->sg_encrypted_size,
 				rec->sg_plaintext_size +
@@ -949,7 +943,7 @@ alloc_payload:
 		}
 
 		get_page(page);
-		sg = rec->sg_plaintext_data + rec->sg_plaintext_num_elem;
+		sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem;
 		sg_set_page(sg, page, copy, offset);
 		sg_unmark_end(sg);
 
@@ -963,7 +957,7 @@ alloc_payload:
 
 		if (full_record || eor ||
 		    rec->sg_plaintext_num_elem ==
-		    ARRAY_SIZE(rec->sg_plaintext_data)) {
+		    ARRAY_SIZE(rec->sg_plaintext_data) - 1) {
 			ret = tls_push_record(sk, flags, record_type);
 			if (ret) {
 				if (ret == -EINPROGRESS)
@@ -1571,7 +1565,7 @@ void tls_sw_free_resources_tx(struct sock *sk)
 		rec = list_first_entry(&ctx->tx_list,
 				       struct tls_rec, list);
 
-		free_sg(sk, rec->sg_plaintext_data,
+		free_sg(sk, &rec->sg_plaintext_data[1],
 			&rec->sg_plaintext_num_elem,
 			&rec->sg_plaintext_size);
 
@@ -1580,11 +1574,11 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	}
 
 	list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
-		free_sg(sk, rec->sg_encrypted_data,
+		free_sg(sk, &rec->sg_encrypted_data[1],
 			&rec->sg_encrypted_num_elem,
 			&rec->sg_encrypted_size);
 
-		free_sg(sk, rec->sg_plaintext_data,
+		free_sg(sk, &rec->sg_plaintext_data[1],
 			&rec->sg_plaintext_num_elem,
 			&rec->sg_plaintext_size);
 
-- 
cgit v1.2.3


From 8bad74f9840f87661f20ced3dc80c84ab4fd55a1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:36 +0000
Subject: bpf: extend cgroup bpf core to allow multiple cgroup storage types

In order to introduce per-cpu cgroup storage, let's generalize
bpf cgroup core to support multiple cgroup storage types.
Potentially, per-node cgroup storage can be added later.

This commit is mostly a formal change that replaces
cgroup_storage pointer with a array of cgroup_storage pointers.
It doesn't actually introduce a new storage type,
it will be done later.

Each bpf program is now able to have one cgroup storage of each type.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 38 +++++++++++++++++-------
 include/linux/bpf.h        | 11 +++++--
 kernel/bpf/cgroup.c        | 74 ++++++++++++++++++++++++++++++++--------------
 kernel/bpf/helpers.c       | 15 ++++++----
 kernel/bpf/local_storage.c | 18 ++++++-----
 kernel/bpf/syscall.c       |  9 ++++--
 kernel/bpf/verifier.c      |  8 +++--
 net/bpf/test_run.c         | 20 +++++++++----
 8 files changed, 136 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index f91b0f8ff3a9..e9871b012dac 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -2,6 +2,7 @@
 #ifndef _BPF_CGROUP_H
 #define _BPF_CGROUP_H
 
+#include <linux/bpf.h>
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
@@ -22,7 +23,10 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+
+#define for_each_cgroup_storage_type(stype) \
+	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
 
 struct bpf_cgroup_storage_map;
 
@@ -43,7 +47,7 @@ struct bpf_cgroup_storage {
 struct bpf_prog_list {
 	struct list_head node;
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *storage;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
 struct bpf_prog_array;
@@ -101,18 +105,29 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
+static inline enum bpf_cgroup_storage_type cgroup_storage_type(
+	struct bpf_map *map)
 {
+	return BPF_CGROUP_STORAGE_SHARED;
+}
+
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
+					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+{
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_storage_buffer *buf;
 
-	if (!storage)
-		return;
+	for_each_cgroup_storage_type(stype) {
+		if (!storage[stype])
+			continue;
 
-	buf = READ_ONCE(storage->buf);
-	this_cpu_write(bpf_cgroup_storage, &buf->data[0]);
+		buf = READ_ONCE(storage[stype]->buf);
+		this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]);
+	}
 }
 
-struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+					enum bpf_cgroup_storage_type stype);
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
 			     struct cgroup *cgroup,
@@ -265,13 +280,14 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
+static inline void bpf_cgroup_storage_set(
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
 					    struct bpf_map *map) { return 0; }
 static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
 					      struct bpf_map *map) {}
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
-	struct bpf_prog *prog) { return 0; }
+	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
 
@@ -293,6 +309,8 @@ static inline void bpf_cgroup_storage_free(
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
+#define for_each_cgroup_storage_type(stype) for (; false; )
+
 #endif /* CONFIG_CGROUP_BPF */
 
 #endif /* _BPF_CGROUP_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 988a00797bcd..b457fbe7b70b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -272,6 +272,13 @@ struct bpf_prog_offload {
 	u32			jited_len;
 };
 
+enum bpf_cgroup_storage_type {
+	BPF_CGROUP_STORAGE_SHARED,
+	__BPF_CGROUP_STORAGE_MAX
+};
+
+#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -289,7 +296,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
-	struct bpf_map *cgroup_storage;
+	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
 	void *security;
@@ -358,7 +365,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
  */
 struct bpf_prog_array_item {
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *cgroup_storage;
+	struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
 struct bpf_prog_array {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 549f6fbcc461..00f6ed2e4f9a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  */
 void cgroup_bpf_put(struct cgroup *cgrp)
 {
+	enum bpf_cgroup_storage_type stype;
 	unsigned int type;
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		list_for_each_entry_safe(pl, tmp, progs, node) {
 			list_del(&pl->node);
 			bpf_prog_put(pl->prog);
-			bpf_cgroup_storage_unlink(pl->storage);
-			bpf_cgroup_storage_free(pl->storage);
+			for_each_cgroup_storage_type(stype) {
+				bpf_cgroup_storage_unlink(pl->storage[stype]);
+				bpf_cgroup_storage_free(pl->storage[stype]);
+			}
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
@@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
 				   struct bpf_prog_array __rcu **array)
 {
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
 	struct cgroup *p = cgrp;
@@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				continue;
 
 			progs->items[cnt].prog = pl->prog;
-			progs->items[cnt].cgroup_storage = pl->storage;
+			for_each_cgroup_storage_type(stype)
+				progs->items[cnt].cgroup_storage[stype] =
+					pl->storage[stype];
 			cnt++;
 		}
 	} while ((p = cgroup_parent(p)));
@@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
-	struct bpf_cgroup_storage *storage, *old_storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
+		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_list *pl;
 	bool pl_was_allocated;
 	int err;
@@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 		return -E2BIG;
 
-	storage = bpf_cgroup_storage_alloc(prog);
-	if (IS_ERR(storage))
-		return -ENOMEM;
+	for_each_cgroup_storage_type(stype) {
+		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storage[stype])) {
+			storage[stype] = NULL;
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
+			return -ENOMEM;
+		}
+	}
 
 	if (flags & BPF_F_ALLOW_MULTI) {
 		list_for_each_entry(pl, progs, node) {
 			if (pl->prog == prog) {
 				/* disallow attaching the same prog twice */
-				bpf_cgroup_storage_free(storage);
+				for_each_cgroup_storage_type(stype)
+					bpf_cgroup_storage_free(storage[stype]);
 				return -EINVAL;
 			}
 		}
 
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
-			bpf_cgroup_storage_free(storage);
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
 			return -ENOMEM;
 		}
 
 		pl_was_allocated = true;
 		pl->prog = prog;
-		pl->storage = storage;
+		for_each_cgroup_storage_type(stype)
+			pl->storage[stype] = storage[stype];
 		list_add_tail(&pl->node, progs);
 	} else {
 		if (list_empty(progs)) {
 			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 			if (!pl) {
-				bpf_cgroup_storage_free(storage);
+				for_each_cgroup_storage_type(stype)
+					bpf_cgroup_storage_free(storage[stype]);
 				return -ENOMEM;
 			}
 			pl_was_allocated = true;
@@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		} else {
 			pl = list_first_entry(progs, typeof(*pl), node);
 			old_prog = pl->prog;
-			old_storage = pl->storage;
-			bpf_cgroup_storage_unlink(old_storage);
+			for_each_cgroup_storage_type(stype) {
+				old_storage[stype] = pl->storage[stype];
+				bpf_cgroup_storage_unlink(old_storage[stype]);
+			}
 			pl_was_allocated = false;
 		}
 		pl->prog = prog;
-		pl->storage = storage;
+		for_each_cgroup_storage_type(stype)
+			pl->storage[stype] = storage[stype];
 	}
 
 	cgrp->bpf.flags[type] = flags;
@@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		goto cleanup;
 
 	static_branch_inc(&cgroup_bpf_enabled_key);
-	if (old_storage)
-		bpf_cgroup_storage_free(old_storage);
+	for_each_cgroup_storage_type(stype) {
+		if (!old_storage[stype])
+			continue;
+		bpf_cgroup_storage_free(old_storage[stype]);
+	}
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
-	bpf_cgroup_storage_link(storage, cgrp, type);
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_link(storage[stype], cgrp, type);
 	return 0;
 
 cleanup:
 	/* and cleanup the prog list */
 	pl->prog = old_prog;
-	bpf_cgroup_storage_free(pl->storage);
-	pl->storage = old_storage;
-	bpf_cgroup_storage_link(old_storage, cgrp, type);
+	for_each_cgroup_storage_type(stype) {
+		bpf_cgroup_storage_free(pl->storage[stype]);
+		pl->storage[stype] = old_storage[stype];
+		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
+	}
 	if (pl_was_allocated) {
 		list_del(&pl->node);
 		kfree(pl);
@@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 unused_flags)
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
+	enum bpf_cgroup_storage_type stype;
 	u32 flags = cgrp->bpf.flags[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_prog_list *pl;
@@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 	/* now can actually delete it from this cgroup list */
 	list_del(&pl->node);
-	bpf_cgroup_storage_unlink(pl->storage);
-	bpf_cgroup_storage_free(pl->storage);
+	for_each_cgroup_storage_type(stype) {
+		bpf_cgroup_storage_unlink(pl->storage[stype]);
+		bpf_cgroup_storage_free(pl->storage[stype]);
+	}
 	kfree(pl);
 	if (list_empty(progs))
 		/* last program was detached, reset flags to zero */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1991466b8327..9070b2ace6aa 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -194,16 +194,18 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+#ifdef CONFIG_CGROUP_BPF
+DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
-	/* map and flags arguments are not used now,
-	 * but provide an ability to extend the API
-	 * for other types of local storages.
-	 * verifier checks that their values are correct.
+	/* flags argument is not used now,
+	 * but provides an ability to extend the API.
+	 * verifier checks that its value is correct.
 	 */
-	return (unsigned long) this_cpu_read(bpf_cgroup_storage);
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+
+	return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]);
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
@@ -214,3 +216,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 #endif
+#endif
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 22ad967d1e5f..0bd9f19fc557 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,7 +7,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(void*, bpf_cgroup_storage);
+DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -251,6 +251,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 {
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
 	int ret = -EBUSY;
 
@@ -258,11 +259,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 
 	if (map->prog && map->prog != prog)
 		goto unlock;
-	if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+	if (prog->aux->cgroup_storage[stype] &&
+	    prog->aux->cgroup_storage[stype] != _map)
 		goto unlock;
 
 	map->prog = prog;
-	prog->aux->cgroup_storage = _map;
+	prog->aux->cgroup_storage[stype] = _map;
 	ret = 0;
 unlock:
 	spin_unlock_bh(&map->lock);
@@ -272,24 +274,26 @@ unlock:
 
 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 {
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
 
 	spin_lock_bh(&map->lock);
 	if (map->prog == prog) {
-		WARN_ON(prog->aux->cgroup_storage != _map);
+		WARN_ON(prog->aux->cgroup_storage[stype] != _map);
 		map->prog = NULL;
-		prog->aux->cgroup_storage = NULL;
+		prog->aux->cgroup_storage[stype] = NULL;
 	}
 	spin_unlock_bh(&map->lock);
 }
 
-struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
 	u32 pages;
 
-	map = prog->aux->cgroup_storage;
+	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b3c2d09bcf7a..8c91d2b41b1e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -988,10 +988,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 /* drop refcnt on maps used by eBPF program and free auxilary data */
 static void free_used_maps(struct bpf_prog_aux *aux)
 {
+	enum bpf_cgroup_storage_type stype;
 	int i;
 
-	if (aux->cgroup_storage)
-		bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+	for_each_cgroup_storage_type(stype) {
+		if (!aux->cgroup_storage[stype])
+			continue;
+		bpf_cgroup_storage_release(aux->prog,
+					   aux->cgroup_storage[stype]);
+	}
 
 	for (i = 0; i < aux->used_map_cnt; i++)
 		bpf_map_put(aux->used_maps[i]);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e986518d7bc3..e90899df585d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5171,11 +5171,15 @@ next_insn:
 /* drop refcnt of maps used by the rejected program */
 static void release_maps(struct bpf_verifier_env *env)
 {
+	enum bpf_cgroup_storage_type stype;
 	int i;
 
-	if (env->prog->aux->cgroup_storage)
+	for_each_cgroup_storage_type(stype) {
+		if (!env->prog->aux->cgroup_storage[stype])
+			continue;
 		bpf_cgroup_storage_release(env->prog,
-					   env->prog->aux->cgroup_storage);
+			env->prog->aux->cgroup_storage[stype]);
+	}
 
 	for (i = 0; i < env->used_map_cnt; i++)
 		bpf_map_put(env->used_maps[i]);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f4078830ea50..0c423b8cd75c 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -12,7 +12,7 @@
 #include <linux/sched/signal.h>
 
 static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
-					    struct bpf_cgroup_storage *storage)
+		struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	u32 ret;
 
@@ -28,13 +28,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
 
 static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 {
-	struct bpf_cgroup_storage *storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
+	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	u32 ret = 0, i;
 
-	storage = bpf_cgroup_storage_alloc(prog);
-	if (IS_ERR(storage))
-		return PTR_ERR(storage);
+	for_each_cgroup_storage_type(stype) {
+		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storage[stype])) {
+			storage[stype] = NULL;
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
+			return -ENOMEM;
+		}
+	}
 
 	if (!repeat)
 		repeat = 1;
@@ -53,7 +60,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 	do_div(time_spent, repeat);
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
-	bpf_cgroup_storage_free(storage);
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_free(storage[stype]);
 
 	return ret;
 }
-- 
cgit v1.2.3


From f294b37ec7b24a574884cd157497a3748081c0f0 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:40 +0000
Subject: bpf: rework cgroup storage pointer passing

To simplify the following introduction of per-cpu cgroup storage,
let's rework a bit a mechanism of passing a pointer to a cgroup
storage into the bpf_get_local_storage(). Let's save a pointer
to the corresponding bpf_cgroup_storage structure, instead of
a pointer to the actual buffer.

It will help us to handle per-cpu storage later, which has
a different way of accessing to the actual data.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 13 ++++---------
 kernel/bpf/helpers.c       |  8 ++++++--
 kernel/bpf/local_storage.c |  3 ++-
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index e9871b012dac..7e0c9a1d48b7 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -23,7 +23,8 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -115,15 +116,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
 					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	enum bpf_cgroup_storage_type stype;
-	struct bpf_storage_buffer *buf;
-
-	for_each_cgroup_storage_type(stype) {
-		if (!storage[stype])
-			continue;
 
-		buf = READ_ONCE(storage[stype]->buf);
-		this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]);
-	}
+	for_each_cgroup_storage_type(stype)
+		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
 }
 
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9070b2ace6aa..e42f8789b7ea 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -195,7 +195,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -204,8 +205,11 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+	struct bpf_cgroup_storage *storage;
 
-	return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]);
+	storage = this_cpu_read(bpf_cgroup_storage[stype]);
+
+	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 0bd9f19fc557..6742292fb39e 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,7 +7,8 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*,
+	       bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
-- 
cgit v1.2.3


From b741f1630346defcbc8cc60f1a2bdae8b3b0036f Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:43 +0000
Subject: bpf: introduce per-cpu cgroup local storage

This commit introduced per-cpu cgroup local storage.

Per-cpu cgroup local storage is very similar to simple cgroup storage
(let's call it shared), except all the data is per-cpu.

The main goal of per-cpu variant is to implement super fast
counters (e.g. packet counters), which don't require neither
lookups, neither atomic operations.

>From userspace's point of view, accessing a per-cpu cgroup storage
is similar to other per-cpu map types (e.g. per-cpu hashmaps and
arrays).

Writing to a per-cpu cgroup storage is not atomic, but is performed
by copying longs, so some minimal atomicity is here, exactly
as with other per-cpu maps.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  20 +++++-
 include/linux/bpf.h        |   1 +
 include/linux/bpf_types.h  |   1 +
 include/uapi/linux/bpf.h   |   1 +
 kernel/bpf/helpers.c       |   8 ++-
 kernel/bpf/local_storage.c | 150 +++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/syscall.c       |  11 +++-
 kernel/bpf/verifier.c      |  15 +++--
 8 files changed, 179 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7e0c9a1d48b7..588dd5f0bd85 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -37,7 +37,10 @@ struct bpf_storage_buffer {
 };
 
 struct bpf_cgroup_storage {
-	struct bpf_storage_buffer *buf;
+	union {
+		struct bpf_storage_buffer *buf;
+		void __percpu *percpu_buf;
+	};
 	struct bpf_cgroup_storage_map *map;
 	struct bpf_cgroup_storage_key key;
 	struct list_head list;
@@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+		return BPF_CGROUP_STORAGE_PERCPU;
+
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
+				     void *value, u64 flags);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
+static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
+						 void *value) {
+	return 0;
+}
+static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
+					void *key, void *value, u64 flags) {
+	return 0;
+}
 
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b457fbe7b70b..018299a595c8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -274,6 +274,7 @@ struct bpf_prog_offload {
 
 enum bpf_cgroup_storage_type {
 	BPF_CGROUP_STORAGE_SHARED,
+	BPF_CGROUP_STORAGE_PERCPU,
 	__BPF_CGROUP_STORAGE_MAX
 };
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c9bd6fb765b0..5432f4c9f50e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa5ccd2385ed..e2070d819e04 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -127,6 +127,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e42f8789b7ea..6502115e8f55 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
 	struct bpf_cgroup_storage *storage;
+	void *ptr;
 
 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
 
-	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		ptr = &READ_ONCE(storage->buf)->data[0];
+	else
+		ptr = this_cpu_ptr(storage->percpu_buf);
+
+	return (unsigned long)ptr;
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 6742292fb39e..944eb297465f 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	return 0;
 }
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+				   void *value)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(storage->percpu_buf, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+				     void *value, u64 map_flags)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+		return -EINVAL;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+				value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
 				       void *_next_key)
 {
@@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 	spin_unlock_bh(&map->lock);
 }
 
+static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
+{
+	size_t size;
+
+	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
+		size = sizeof(struct bpf_storage_buffer) + map->value_size;
+		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	} else {
+		size = map->value_size;
+		*pages = round_up(round_up(size, 8) * num_possible_cpus(),
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	}
+
+	return size;
+}
+
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
+	gfp_t flags;
+	size_t size;
 	u32 pages;
 
 	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	size = bpf_cgroup_storage_calculate_size(map, &pages);
+
 	if (bpf_map_charge_memlock(map, pages))
 		return ERR_PTR(-EPERM);
 
 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
 			       __GFP_ZERO | GFP_USER, map->numa_node);
-	if (!storage) {
-		bpf_map_uncharge_memlock(map, pages);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!storage)
+		goto enomem;
 
-	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
-				    map->value_size, __GFP_ZERO | GFP_USER,
-				    map->numa_node);
-	if (!storage->buf) {
-		bpf_map_uncharge_memlock(map, pages);
-		kfree(storage);
-		return ERR_PTR(-ENOMEM);
+	flags = __GFP_ZERO | GFP_USER;
+
+	if (stype == BPF_CGROUP_STORAGE_SHARED) {
+		storage->buf = kmalloc_node(size, flags, map->numa_node);
+		if (!storage->buf)
+			goto enomem;
+	} else {
+		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+		if (!storage->percpu_buf)
+			goto enomem;
 	}
 
 	storage->map = (struct bpf_cgroup_storage_map *)map;
 
 	return storage;
+
+enomem:
+	bpf_map_uncharge_memlock(map, pages);
+	kfree(storage);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	kfree(storage->buf);
+	kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	free_percpu(storage->percpu_buf);
+	kfree(storage);
 }
 
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
 {
-	u32 pages;
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_map *map;
+	u32 pages;
 
 	if (!storage)
 		return;
 
 	map = &storage->map->map;
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+
+	bpf_cgroup_storage_calculate_size(map, &pages);
 	bpf_map_uncharge_memlock(map, pages);
 
-	kfree_rcu(storage->buf, rcu);
-	kfree_rcu(storage, rcu);
+	stype = cgroup_storage_type(map);
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+	else
+		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
 }
 
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8c91d2b41b1e..5742df21598c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else if (IS_FD_MAP(map))
 		value_size = sizeof(u32);
@@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
 	} else if (IS_FD_ARRAY(map)) {
@@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else
 		value_size = map->value_size;
@@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       attr->flags);
 	} else if (IS_FD_ARRAY(map)) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e90899df585d..a8cc83a970d1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
+	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
 		if (func_id != BPF_FUNC_get_local_storage)
 			goto error;
 		break;
@@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_get_local_storage:
-		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 			goto error;
 		break;
 	case BPF_FUNC_sk_select_reuseport:
@@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
 /* look for pseudo eBPF instructions that access map FDs and
  * replace them with actual map pointers
  */
@@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			}
 			env->used_maps[env->used_map_cnt++] = map;
 
-			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+			if (bpf_map_is_cgroup_storage(map) &&
 			    bpf_cgroup_storage_assign(env->prog, map)) {
-				verbose(env,
-					"only one cgroup storage is allowed\n");
+				verbose(env, "only one cgroup storage of each type is allowed\n");
 				fdput(f);
 				return -EBUSY;
 			}
-- 
cgit v1.2.3


From 59c9d35ea9cd73c3a55642ec9a0097770baccb93 Mon Sep 17 00:00:00 2001
From: Alaa Hleihel <alaa@mellanox.com>
Date: Wed, 5 Sep 2018 17:06:37 +0300
Subject: net/mlx5: Cache the system image guid

The system image guid is a read-only field which is used by the TC
offloads code to determine if two mlx5 devices belong to the same
ASIC while adding flows.

Read this once and save it on the core device rather than querying each
time an offloaded flow is added.

Signed-off-by: Alaa Hleihel <alaa@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 9 +++++++++
 include/linux/mlx5/driver.h                     | 1 +
 include/linux/mlx5/vport.h                      | 2 ++
 4 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 9fed54017659..82723a0e509a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2040,8 +2040,8 @@ static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv)
 	fmdev = priv->mdev;
 	pmdev = peer_priv->mdev;
 
-	mlx5_query_nic_vport_system_image_guid(fmdev, &fsystem_guid);
-	mlx5_query_nic_vport_system_image_guid(pmdev, &psystem_guid);
+	fsystem_guid = mlx5_query_nic_system_image_guid(fmdev);
+	psystem_guid = mlx5_query_nic_system_image_guid(pmdev);
 
 	return (fsystem_guid == psystem_guid);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index b02af317c125..cfbea66b4879 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -1201,3 +1201,12 @@ int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev)
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_nic_vport_unaffiliate_multiport);
+
+u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev)
+{
+	if (!mdev->sys_image_guid)
+		mlx5_query_nic_vport_system_image_guid(mdev, &mdev->sys_image_guid);
+
+	return mdev->sys_image_guid;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ed73b51f6697..26a92462f4ce 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -838,6 +838,7 @@ struct mlx5_core_dev {
 		u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
 		u32 qcam[MLX5_ST_SZ_DW(qcam_reg)];
 	} caps;
+	u64			sys_image_guid;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
 	enum mlx5_device_state	state;
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 7e7c6dfcfb09..9c694808c212 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -121,4 +121,6 @@ int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status);
 int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev,
 				       struct mlx5_core_dev *port_mdev);
 int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev);
+
+u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev);
 #endif /* __MLX5_VPORT_H__ */
-- 
cgit v1.2.3


From b31cdffa2329fe330cd304ca26c250dd1520fb0a Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:09 +0200
Subject: net: phy: Move linkmode helpers to somewhere public

phylink has some useful helpers to working with linkmode bitmaps.
Move them to there own header so other code can use them.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 27 -------------------
 include/linux/linkmode.h  | 67 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mii.h       |  1 +
 include/linux/phy.h       |  1 +
 4 files changed, 69 insertions(+), 27 deletions(-)
 create mode 100644 include/linux/linkmode.h

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 1d01e0c625a5..b6993af5c9e4 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -68,33 +68,6 @@ struct phylink {
 	struct sfp_bus *sfp_bus;
 };
 
-static inline void linkmode_zero(unsigned long *dst)
-{
-	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
-}
-
-static inline void linkmode_copy(unsigned long *dst, const unsigned long *src)
-{
-	bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS);
-}
-
-static inline void linkmode_and(unsigned long *dst, const unsigned long *a,
-				const unsigned long *b)
-{
-	bitmap_and(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS);
-}
-
-static inline void linkmode_or(unsigned long *dst, const unsigned long *a,
-				const unsigned long *b)
-{
-	bitmap_or(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS);
-}
-
-static inline bool linkmode_empty(const unsigned long *src)
-{
-	return bitmap_empty(src, __ETHTOOL_LINK_MODE_MASK_NBITS);
-}
-
 /**
  * phylink_set_port_modes() - set the port type modes in the ethtool mask
  * @mask: ethtool link mode mask
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
new file mode 100644
index 000000000000..014fb86c7114
--- /dev/null
+++ b/include/linux/linkmode.h
@@ -0,0 +1,67 @@
+#ifndef __LINKMODE_H
+#define __LINKMODE_H
+
+#include <linux/bitmap.h>
+#include <linux/ethtool.h>
+#include <uapi/linux/ethtool.h>
+
+static inline void linkmode_zero(unsigned long *dst)
+{
+	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static inline void linkmode_copy(unsigned long *dst, const unsigned long *src)
+{
+	bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static inline void linkmode_and(unsigned long *dst, const unsigned long *a,
+				const unsigned long *b)
+{
+	bitmap_and(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static inline void linkmode_or(unsigned long *dst, const unsigned long *a,
+				const unsigned long *b)
+{
+	bitmap_or(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static inline bool linkmode_empty(const unsigned long *src)
+{
+	return bitmap_empty(src, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
+				  const unsigned long *src2)
+{
+	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
+static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
+{
+	__clear_bit(nr, addr);
+}
+
+static inline void linkmode_change_bit(int nr, volatile unsigned long *addr)
+{
+	__change_bit(nr, addr);
+}
+
+static inline int linkmode_test_bit(int nr, volatile unsigned long *addr)
+{
+	return test_bit(nr, addr);
+}
+
+static inline int linkmode_equal(const unsigned long *src1,
+				 const unsigned long *src2)
+{
+	return bitmap_equal(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+#endif /* __LINKMODE_H */
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 55000ee5c6ad..567047ef0309 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -10,6 +10,7 @@
 
 
 #include <linux/if.h>
+#include <linux/linkmode.h>
 #include <uapi/linux/mii.h>
 
 struct ethtool_cmd;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 192a1fa0c73b..d24cc46748e2 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -19,6 +19,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
+#include <linux/linkmode.h>
 #include <linux/mdio.h>
 #include <linux/mii.h>
 #include <linux/module.h>
-- 
cgit v1.2.3


From ab2a605fa621ecf4ec26603a237822f7772cfa28 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:10 +0200
Subject: net: phy: Add phydev_warn()

Not all new style LINK_MODE bits can be converted into old style
SUPPORTED bits. We need to warn when such a conversion is attempted.
Add a helper for this.

Convert all pr_warn() calls to phydev_warn() where possible.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/at803x.c     |  2 +-
 drivers/net/phy/dp83640.c    |  7 ++++---
 drivers/net/phy/marvell.c    |  2 +-
 drivers/net/phy/marvell10g.c |  6 +++---
 drivers/net/phy/microchip.c  | 33 +++++++++++++++++----------------
 include/linux/phy.h          |  3 +++
 6 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 411cf1072bae..e74a047a846e 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -357,7 +357,7 @@ static int at803x_aneg_done(struct phy_device *phydev)
 
 	/* check if the SGMII link is OK. */
 	if (!(phy_read(phydev, AT803X_PSSR) & AT803X_PSSR_MR_AN_COMPLETE)) {
-		pr_warn("803x_aneg_done: SGMII link is not ok\n");
+		phydev_warn(phydev, "803x_aneg_done: SGMII link is not ok\n");
 		aneg_done = 0;
 	}
 	/* switch back to copper page */
diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 29aa8d772b0c..74cf356d8171 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -553,16 +553,17 @@ static void enable_status_frames(struct phy_device *phydev, bool on)
 	mutex_unlock(&clock->extreg_lock);
 
 	if (!phydev->attached_dev) {
-		pr_warn("expected to find an attached netdevice\n");
+		phydev_warn(phydev,
+			    "expected to find an attached netdevice\n");
 		return;
 	}
 
 	if (on) {
 		if (dev_mc_add(phydev->attached_dev, status_frame_dst))
-			pr_warn("failed to add mc address\n");
+			phydev_warn(phydev, "failed to add mc address\n");
 	} else {
 		if (dev_mc_del(phydev->attached_dev, status_frame_dst))
-			pr_warn("failed to delete mc address\n");
+			phydev_warn(phydev, "failed to delete mc address\n");
 	}
 }
 
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 24fc4a73c300..8872a430d74a 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -638,7 +638,7 @@ static void marvell_config_led(struct phy_device *phydev)
 	err = phy_write_paged(phydev, MII_MARVELL_LED_PAGE, MII_PHY_LED_CTRL,
 			      def_config);
 	if (err < 0)
-		pr_warn("Fail to config marvell phy LED.\n");
+		phydev_warn(phydev, "Fail to config marvell phy LED.\n");
 }
 
 static int marvell_config_init(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index f77a2d9e7f9d..f214834819dd 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -337,9 +337,9 @@ static int mv3310_config_init(struct phy_device *phydev)
 	}
 
 	if (!ethtool_convert_link_mode_to_legacy_u32(&mask, supported))
-		dev_warn(&phydev->mdio.dev,
-			 "PHY supports (%*pb) more modes than phylib supports, some modes not supported.\n",
-			 __ETHTOOL_LINK_MODE_MASK_NBITS, supported);
+		phydev_warn(phydev,
+			    "PHY supports (%*pb) more modes than phylib supports, some modes not supported.\n",
+			    __ETHTOOL_LINK_MODE_MASK_NBITS, supported);
 
 	phydev->supported &= mask;
 	phydev->advertising &= phydev->supported;
diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c
index 2d67937866a3..04b12e34da58 100644
--- a/drivers/net/phy/microchip.c
+++ b/drivers/net/phy/microchip.c
@@ -88,7 +88,7 @@ static int lan88xx_TR_reg_set(struct phy_device *phydev, u16 regaddr,
 	/* Save current page */
 	save_page = phy_save_page(phydev);
 	if (save_page < 0) {
-		pr_warn("Failed to get current page\n");
+		phydev_warn(phydev, "Failed to get current page\n");
 		goto err;
 	}
 
@@ -98,14 +98,14 @@ static int lan88xx_TR_reg_set(struct phy_device *phydev, u16 regaddr,
 	ret = __phy_write(phydev, LAN88XX_EXT_PAGE_TR_LOW_DATA,
 			  (data & 0xFFFF));
 	if (ret < 0) {
-		pr_warn("Failed to write TR low data\n");
+		phydev_warn(phydev, "Failed to write TR low data\n");
 		goto err;
 	}
 
 	ret = __phy_write(phydev, LAN88XX_EXT_PAGE_TR_HIGH_DATA,
 			  (data & 0x00FF0000) >> 16);
 	if (ret < 0) {
-		pr_warn("Failed to write TR high data\n");
+		phydev_warn(phydev, "Failed to write TR high data\n");
 		goto err;
 	}
 
@@ -115,14 +115,15 @@ static int lan88xx_TR_reg_set(struct phy_device *phydev, u16 regaddr,
 
 	ret = __phy_write(phydev, LAN88XX_EXT_PAGE_TR_CR, buf);
 	if (ret < 0) {
-		pr_warn("Failed to write data in reg\n");
+		phydev_warn(phydev, "Failed to write data in reg\n");
 		goto err;
 	}
 
 	usleep_range(1000, 2000);/* Wait for Data to be written */
 	val = __phy_read(phydev, LAN88XX_EXT_PAGE_TR_CR);
 	if (!(val & 0x8000))
-		pr_warn("TR Register[0x%X] configuration failed\n", regaddr);
+		phydev_warn(phydev, "TR Register[0x%X] configuration failed\n",
+			    regaddr);
 err:
 	return phy_restore_page(phydev, save_page, ret);
 }
@@ -137,7 +138,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x0F82, 0x12B00A);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x0F82]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x0F82]\n");
 
 	/* Get access to Channel b'10, Node b'1101, Register 0x06.
 	 * Write 24-bit value 0xD2C46F to register. Setting SSTrKf1000Slv,
@@ -145,7 +146,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x168C, 0xD2C46F);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x168C]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x168C]\n");
 
 	/* Get access to Channel b'10, Node b'1111, Register 0x11.
 	 * Write 24-bit value 0x620 to register. Setting rem_upd_done_thresh
@@ -153,7 +154,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x17A2, 0x620);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x17A2]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x17A2]\n");
 
 	/* Get access to Channel b'10, Node b'1101, Register 0x10.
 	 * Write 24-bit value 0xEEFFDD to register. Setting
@@ -162,7 +163,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x16A0, 0xEEFFDD);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x16A0]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x16A0]\n");
 
 	/* Get access to Channel b'10, Node b'1101, Register 0x13.
 	 * Write 24-bit value 0x071448 to register. Setting
@@ -170,7 +171,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x16A6, 0x071448);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x16A6]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x16A6]\n");
 
 	/* Get access to Channel b'10, Node b'1101, Register 0x12.
 	 * Write 24-bit value 0x13132F to register. Setting
@@ -178,7 +179,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x16A4, 0x13132F);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x16A4]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x16A4]\n");
 
 	/* Get access to Channel b'10, Node b'1101, Register 0x14.
 	 * Write 24-bit value 0x0 to register. Setting eee_3level_delay,
@@ -186,7 +187,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x16A8, 0x0);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x16A8]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x16A8]\n");
 
 	/* Get access to Channel b'01, Node b'1111, Register 0x34.
 	 * Write 24-bit value 0x91B06C to register. Setting
@@ -195,7 +196,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x0FE8, 0x91B06C);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x0FE8]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x0FE8]\n");
 
 	/* Get access to Channel b'01, Node b'1111, Register 0x3E.
 	 * Write 24-bit value 0xC0A028 to register. Setting
@@ -204,7 +205,7 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x0FFC, 0xC0A028);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x0FFC]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x0FFC]\n");
 
 	/* Get access to Channel b'01, Node b'1111, Register 0x35.
 	 * Write 24-bit value 0x041600 to register. Setting
@@ -213,14 +214,14 @@ static void lan88xx_config_TR_regs(struct phy_device *phydev)
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x0FEA, 0x041600);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x0FEA]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x0FEA]\n");
 
 	/* Get access to Channel b'10, Node b'1101, Register 0x03.
 	 * Write 24-bit value 0x000004 to register. Setting TrFreeze bits.
 	 */
 	err = lan88xx_TR_reg_set(phydev, 0x1686, 0x000004);
 	if (err < 0)
-		pr_warn("Failed to Set Register[0x1686]\n");
+		phydev_warn(phydev, "Failed to Set Register[0x1686]\n");
 }
 
 static int lan88xx_probe(struct phy_device *phydev)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d24cc46748e2..0ab9f89773fd 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -968,6 +968,9 @@ static inline void phy_device_reset(struct phy_device *phydev, int value)
 #define phydev_err(_phydev, format, args...)	\
 	dev_err(&_phydev->mdio.dev, format, ##args)
 
+#define phydev_warn(_phydev, format, args...)	\
+	dev_warn(&_phydev->mdio.dev, format, ##args)
+
 #define phydev_dbg(_phydev, format, args...)	\
 	dev_dbg(&_phydev->mdio.dev, format, ##args)
 
-- 
cgit v1.2.3


From c4fabb8b3c0d724eb93dabaf346b0dd8a8be7118 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:11 +0200
Subject: net: phy: Add phydev_info()

Add phydev_info() and make use of it within the phy drivers and core
code.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/dp83640.c    | 11 ++++++-----
 drivers/net/phy/phy_device.c |  4 ++--
 include/linux/phy.h          |  3 +++
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 74cf356d8171..edd4d44a386d 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -687,9 +687,9 @@ static void recalibrate(struct dp83640_clock *clock)
 	 * read out and correct offsets
 	 */
 	val = ext_read(master, PAGE4, PTP_STS);
-	pr_info("master PTP_STS  0x%04hx\n", val);
+	phydev_info(master, "master PTP_STS  0x%04hx\n", val);
 	val = ext_read(master, PAGE4, PTP_ESTS);
-	pr_info("master PTP_ESTS 0x%04hx\n", val);
+	phydev_info(master, "master PTP_ESTS 0x%04hx\n", val);
 	event_ts.ns_lo  = ext_read(master, PAGE4, PTP_EDATA);
 	event_ts.ns_hi  = ext_read(master, PAGE4, PTP_EDATA);
 	event_ts.sec_lo = ext_read(master, PAGE4, PTP_EDATA);
@@ -699,15 +699,16 @@ static void recalibrate(struct dp83640_clock *clock)
 	list_for_each(this, &clock->phylist) {
 		tmp = list_entry(this, struct dp83640_private, list);
 		val = ext_read(tmp->phydev, PAGE4, PTP_STS);
-		pr_info("slave  PTP_STS  0x%04hx\n", val);
+		phydev_info(tmp->phydev, "slave  PTP_STS  0x%04hx\n", val);
 		val = ext_read(tmp->phydev, PAGE4, PTP_ESTS);
-		pr_info("slave  PTP_ESTS 0x%04hx\n", val);
+		phydev_info(tmp->phydev, "slave  PTP_ESTS 0x%04hx\n", val);
 		event_ts.ns_lo  = ext_read(tmp->phydev, PAGE4, PTP_EDATA);
 		event_ts.ns_hi  = ext_read(tmp->phydev, PAGE4, PTP_EDATA);
 		event_ts.sec_lo = ext_read(tmp->phydev, PAGE4, PTP_EDATA);
 		event_ts.sec_hi = ext_read(tmp->phydev, PAGE4, PTP_EDATA);
 		diff = now - (s64) phy2txts(&event_ts);
-		pr_info("slave offset %lld nanoseconds\n", diff);
+		phydev_info(tmp->phydev, "slave offset %lld nanoseconds\n",
+			    diff);
 		diff += ADJTIME_FIX;
 		ts = ns_to_timespec64(diff);
 		tdr_write(0, tmp->phydev, &ts, PTP_STEP_CLK);
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index ee676d75fe02..35102e17bbeb 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -920,13 +920,13 @@ void phy_attached_print(struct phy_device *phydev, const char *fmt, ...)
 
 
 	if (!fmt) {
-		dev_info(&phydev->mdio.dev, ATTACHED_FMT "\n",
+		phydev_info(phydev, ATTACHED_FMT "\n",
 			 drv_name, phydev_name(phydev),
 			 irq_str);
 	} else {
 		va_list ap;
 
-		dev_info(&phydev->mdio.dev, ATTACHED_FMT,
+		phydev_info(phydev, ATTACHED_FMT,
 			 drv_name, phydev_name(phydev),
 			 irq_str);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 0ab9f89773fd..0f6e7bf5e9c5 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -968,6 +968,9 @@ static inline void phy_device_reset(struct phy_device *phydev, int value)
 #define phydev_err(_phydev, format, args...)	\
 	dev_err(&_phydev->mdio.dev, format, ##args)
 
+#define phydev_info(_phydev, format, args...)	\
+	dev_info(&_phydev->mdio.dev, format, ##args)
+
 #define phydev_warn(_phydev, format, args...)	\
 	dev_warn(&_phydev->mdio.dev, format, ##args)
 
-- 
cgit v1.2.3


From edc7ccbbcf32b97c7d26cd556f364eb4f22c4285 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:12 +0200
Subject: net: phy: Add helper to convert MII ADV register to a linkmode

The phy_mii_ioctl can be used to write a value into the MII_ADVERTISE
register in the PHY. Since this changes the state of the PHY, we need
to make the same change to phydev->advertising. Add a helper which can
convert the register value to a linkmode.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/linux/mii.h b/include/linux/mii.h
index 567047ef0309..8c7da9473ad9 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -303,6 +303,37 @@ static inline u32 mii_lpa_to_ethtool_lpa_x(u32 lpa)
 	return result | mii_adv_to_ethtool_adv_x(lpa);
 }
 
+/**
+ * mii_adv_to_linkmode_adv_t
+ * @advertising:pointer to destination link mode.
+ * @adv: value of the MII_ADVERTISE register
+ *
+ * A small helper function that translates MII_ADVERTISE bits
+ * to linkmode advertisement settings.
+ */
+static inline void mii_adv_to_linkmode_adv_t(unsigned long *advertising,
+					     u32 adv)
+{
+	linkmode_zero(advertising);
+
+	if (adv & ADVERTISE_10HALF)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+				 advertising);
+	if (adv & ADVERTISE_10FULL)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+				 advertising);
+	if (adv & ADVERTISE_100HALF)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+				 advertising);
+	if (adv & ADVERTISE_100FULL)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+				 advertising);
+	if (adv & ADVERTISE_PAUSE_CAP)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising);
+	if (adv & ADVERTISE_PAUSE_ASYM)
+		linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising);
+}
+
 /**
  * mii_advertise_flowctrl - get flow control advertisement flags
  * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both)
-- 
cgit v1.2.3


From 5f991f7bddc991ecc3c8a009ffd76fccff4661c7 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:13 +0200
Subject: net: phy: Add helper for advertise to lcl value

Add a helper to convert the local advertising to an LCL capabilities,
which is then used to resolve pause flow control settings.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mt7530.c                              |  6 +-----
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c           |  5 +----
 drivers/net/ethernet/freescale/fman/mac.c             |  6 +-----
 drivers/net/ethernet/freescale/gianfar.c              |  7 +------
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   |  6 +-----
 drivers/net/ethernet/mediatek/mtk_eth_soc.c           |  6 +-----
 drivers/net/ethernet/socionext/sni_ave.c              |  5 +----
 include/linux/mii.h                                   | 19 +++++++++++++++++++
 8 files changed, 26 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 62e486652e62..a5de9bffe5be 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -658,11 +658,7 @@ static void mt7530_adjust_link(struct dsa_switch *ds, int port,
 			if (phydev->asym_pause)
 				rmt_adv |= LPA_PAUSE_ASYM;
 
-			if (phydev->advertising & ADVERTISED_Pause)
-				lcl_adv |= ADVERTISE_PAUSE_CAP;
-			if (phydev->advertising & ADVERTISED_Asym_Pause)
-				lcl_adv |= ADVERTISE_PAUSE_ASYM;
-
+			lcl_adv = ethtool_adv_to_lcl_adv_t(phydev->advertising);
 			flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv);
 
 			if (flowctrl & FLOW_CTRL_TX)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 289129011b9f..a7e03e3ecc93 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1495,10 +1495,7 @@ static void xgbe_phy_phydev_flowctrl(struct xgbe_prv_data *pdata)
 	if (!phy_data->phydev)
 		return;
 
-	if (phy_data->phydev->advertising & ADVERTISED_Pause)
-		lcl_adv |= ADVERTISE_PAUSE_CAP;
-	if (phy_data->phydev->advertising & ADVERTISED_Asym_Pause)
-		lcl_adv |= ADVERTISE_PAUSE_ASYM;
+	lcl_adv = ethtool_adv_to_lcl_adv_t(phy_data->phydev->advertising);
 
 	if (phy_data->phydev->pause) {
 		XGBE_SET_LP_ADV(lks, Pause);
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index a847b9c3b31a..d79e4e009d63 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -393,11 +393,7 @@ void fman_get_pause_cfg(struct mac_device *mac_dev, bool *rx_pause,
 	 */
 
 	/* get local capabilities */
-	lcl_adv = 0;
-	if (phy_dev->advertising & ADVERTISED_Pause)
-		lcl_adv |= ADVERTISE_PAUSE_CAP;
-	if (phy_dev->advertising & ADVERTISED_Asym_Pause)
-		lcl_adv |= ADVERTISE_PAUSE_ASYM;
+	lcl_adv = ethtool_adv_to_lcl_adv_t(phy_dev->advertising);
 
 	/* get link partner capabilities */
 	rmt_adv = 0;
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 0bd21a493016..3c8da1a18ba0 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -3656,12 +3656,7 @@ static u32 gfar_get_flowctrl_cfg(struct gfar_private *priv)
 		if (phydev->asym_pause)
 			rmt_adv |= LPA_PAUSE_ASYM;
 
-		lcl_adv = 0;
-		if (phydev->advertising & ADVERTISED_Pause)
-			lcl_adv |= ADVERTISE_PAUSE_CAP;
-		if (phydev->advertising & ADVERTISED_Asym_Pause)
-			lcl_adv |= ADVERTISE_PAUSE_ASYM;
-
+		lcl_adv = ethtool_adv_to_lcl_adv_t(phydev->advertising);
 		flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv);
 		if (flowctrl & FLOW_CTRL_TX)
 			val |= MACCFG1_TX_FLOW;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 7c8b686b1ce1..c17ceeefa453 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5006,11 +5006,7 @@ int hclge_cfg_flowctrl(struct hclge_dev *hdev)
 	if (!phydev->link || !phydev->autoneg)
 		return 0;
 
-	if (phydev->advertising & ADVERTISED_Pause)
-		local_advertising = ADVERTISE_PAUSE_CAP;
-
-	if (phydev->advertising & ADVERTISED_Asym_Pause)
-		local_advertising |= ADVERTISE_PAUSE_ASYM;
+	local_advertising = ethtool_adv_to_lcl_adv_t(phydev->advertising);
 
 	if (phydev->pause)
 		remote_advertising = LPA_PAUSE_CAP;
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index cc1e9a96a43b..7dbfdac4067a 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -243,11 +243,7 @@ static void mtk_phy_link_adjust(struct net_device *dev)
 		if (dev->phydev->asym_pause)
 			rmt_adv |= LPA_PAUSE_ASYM;
 
-		if (dev->phydev->advertising & ADVERTISED_Pause)
-			lcl_adv |= ADVERTISE_PAUSE_CAP;
-		if (dev->phydev->advertising & ADVERTISED_Asym_Pause)
-			lcl_adv |= ADVERTISE_PAUSE_ASYM;
-
+		lcl_adv = ethtool_adv_to_lcl_adv_t(dev->phydev->advertising);
 		flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv);
 
 		if (flowctrl & FLOW_CTRL_TX)
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 2a156dcd4534..6732f5cbde08 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -1116,11 +1116,8 @@ static void ave_phy_adjust_link(struct net_device *ndev)
 			rmt_adv |= LPA_PAUSE_CAP;
 		if (phydev->asym_pause)
 			rmt_adv |= LPA_PAUSE_ASYM;
-		if (phydev->advertising & ADVERTISED_Pause)
-			lcl_adv |= ADVERTISE_PAUSE_CAP;
-		if (phydev->advertising & ADVERTISED_Asym_Pause)
-			lcl_adv |= ADVERTISE_PAUSE_ASYM;
 
+		lcl_adv = ethtool_adv_to_lcl_adv_t(phydev->advertising);
 		cap = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv);
 		if (cap & FLOW_CTRL_TX)
 			txcr |= AVE_TXCR_FLOCTR;
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 8c7da9473ad9..9ed49c8261d0 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -334,6 +334,25 @@ static inline void mii_adv_to_linkmode_adv_t(unsigned long *advertising,
 		linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising);
 }
 
+/**
+ * ethtool_adv_to_lcl_adv_t
+ * @advertising:pointer to ethtool advertising
+ *
+ * A small helper function that translates ethtool advertising to LVL
+ * pause capabilities.
+ */
+static inline u32 ethtool_adv_to_lcl_adv_t(u32 advertising)
+{
+	u32 lcl_adv = 0;
+
+	if (advertising & ADVERTISED_Pause)
+		lcl_adv |= ADVERTISE_PAUSE_CAP;
+	if (advertising & ADVERTISED_Asym_Pause)
+		lcl_adv |= ADVERTISE_PAUSE_ASYM;
+
+	return lcl_adv;
+}
+
 /**
  * mii_advertise_flowctrl - get flow control advertisement flags
  * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both)
-- 
cgit v1.2.3


From f954a04ea18ebfcba1cd2756eaee59eb4978a20e Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:14 +0200
Subject: net: phy: Add limkmode equivalents to some of the MII ethtool helpers

Add helpers which take a linkmode rather than a u32 ethtool for
advertising settings.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/linux/mii.h b/include/linux/mii.h
index 9ed49c8261d0..2da85b02e1c0 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -132,6 +132,34 @@ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv)
 	return result;
 }
 
+/**
+ * linkmode_adv_to_mii_adv_t
+ * @advertising: the linkmode advertisement settings
+ *
+ * A small helper function that translates linkmode advertisement
+ * settings to phy autonegotiation advertisements for the
+ * MII_ADVERTISE register.
+ */
+static inline u32 linkmode_adv_to_mii_adv_t(unsigned long *advertising)
+{
+	u32 result = 0;
+
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising))
+		result |= ADVERTISE_10HALF;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising))
+		result |= ADVERTISE_10FULL;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising))
+		result |= ADVERTISE_100HALF;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising))
+		result |= ADVERTISE_100FULL;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising))
+		result |= ADVERTISE_PAUSE_CAP;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising))
+		result |= ADVERTISE_PAUSE_ASYM;
+
+	return result;
+}
+
 /**
  * mii_adv_to_ethtool_adv_t
  * @adv: value of the MII_ADVERTISE register
@@ -179,6 +207,28 @@ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv)
 	return result;
 }
 
+/**
+ * linkmode_adv_to_mii_ctrl1000_t
+ * advertising: the linkmode advertisement settings
+ *
+ * A small helper function that translates linkmode advertisement
+ * settings to phy autonegotiation advertisements for the
+ * MII_CTRL1000 register when in 1000T mode.
+ */
+static inline u32 linkmode_adv_to_mii_ctrl1000_t(unsigned long *advertising)
+{
+	u32 result = 0;
+
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
+			      advertising))
+		result |= ADVERTISE_1000HALF;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+			      advertising))
+		result |= ADVERTISE_1000FULL;
+
+	return result;
+}
+
 /**
  * mii_ctrl1000_to_ethtool_adv_t
  * @adv: value of the MII_CTRL1000 register
-- 
cgit v1.2.3


From 719655a149715f26fc4de904fe0aa83068bd5b9e Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 29 Sep 2018 23:04:16 +0200
Subject: net: phy: Replace phy driver features u32 with link_mode bitmap

This is one step in allowing phylib to make use of link_mode bitmaps,
instead of u32 for supported and advertised features. Convert the phy
drivers to use bitmaps to indicates the features they support.

Build bitmap equivalents of the u32 values at runtime, and have the
drivers point to the appropriate bitmap. These bitmaps are shared, and
we don't want a driver to modify them. So mark them __ro_after_init.

Within phylib, the features bitmap is currently turned back into a
u32. This will be removed once the whole of phylib, and the drivers
are converted to use bitmaps.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/pxa168_eth.c |   4 +-
 drivers/net/phy/aquantia.c                |  12 +--
 drivers/net/phy/bcm63xx.c                 |   9 +-
 drivers/net/phy/marvell.c                 |   2 +-
 drivers/net/phy/marvell10g.c              |  11 +-
 drivers/net/phy/microchip_t1.c            |   2 +-
 drivers/net/phy/phy_device.c              | 164 ++++++++++++++++++++++++++++--
 include/linux/linkmode.h                  |   9 ++
 include/linux/phy.h                       |  24 +++--
 9 files changed, 198 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index ff2fea0f8b75..0bd4351b2a49 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -988,8 +988,8 @@ static int pxa168_init_phy(struct net_device *dev)
 	cmd.base.phy_address = pep->phy_addr;
 	cmd.base.speed = pep->phy_speed;
 	cmd.base.duplex = pep->phy_duplex;
-	ethtool_convert_legacy_u32_to_link_mode(cmd.link_modes.advertising,
-						PHY_BASIC_FEATURES);
+	bitmap_copy(cmd.link_modes.advertising, PHY_BASIC_FEATURES,
+		    __ETHTOOL_LINK_MODE_MASK_NBITS);
 	cmd.base.autoneg = AUTONEG_ENABLE;
 
 	if (cmd.base.speed != 0)
diff --git a/drivers/net/phy/aquantia.c b/drivers/net/phy/aquantia.c
index 319edc9c8ec7..632472cab3bb 100644
--- a/drivers/net/phy/aquantia.c
+++ b/drivers/net/phy/aquantia.c
@@ -115,7 +115,7 @@ static struct phy_driver aquantia_driver[] = {
 	.phy_id		= PHY_ID_AQ1202,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Aquantia AQ1202",
-	.features	= PHY_AQUANTIA_FEATURES,
+	.features	= PHY_10GBIT_FULL_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT,
 	.aneg_done	= genphy_c45_aneg_done,
 	.config_aneg    = aquantia_config_aneg,
@@ -127,7 +127,7 @@ static struct phy_driver aquantia_driver[] = {
 	.phy_id		= PHY_ID_AQ2104,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Aquantia AQ2104",
-	.features	= PHY_AQUANTIA_FEATURES,
+	.features	= PHY_10GBIT_FULL_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT,
 	.aneg_done	= genphy_c45_aneg_done,
 	.config_aneg    = aquantia_config_aneg,
@@ -139,7 +139,7 @@ static struct phy_driver aquantia_driver[] = {
 	.phy_id		= PHY_ID_AQR105,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Aquantia AQR105",
-	.features	= PHY_AQUANTIA_FEATURES,
+	.features	= PHY_10GBIT_FULL_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT,
 	.aneg_done	= genphy_c45_aneg_done,
 	.config_aneg    = aquantia_config_aneg,
@@ -151,7 +151,7 @@ static struct phy_driver aquantia_driver[] = {
 	.phy_id		= PHY_ID_AQR106,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Aquantia AQR106",
-	.features	= PHY_AQUANTIA_FEATURES,
+	.features	= PHY_10GBIT_FULL_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT,
 	.aneg_done	= genphy_c45_aneg_done,
 	.config_aneg    = aquantia_config_aneg,
@@ -163,7 +163,7 @@ static struct phy_driver aquantia_driver[] = {
 	.phy_id		= PHY_ID_AQR107,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Aquantia AQR107",
-	.features	= PHY_AQUANTIA_FEATURES,
+	.features	= PHY_10GBIT_FULL_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT,
 	.aneg_done	= genphy_c45_aneg_done,
 	.config_aneg    = aquantia_config_aneg,
@@ -175,7 +175,7 @@ static struct phy_driver aquantia_driver[] = {
 	.phy_id		= PHY_ID_AQR405,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Aquantia AQR405",
-	.features	= PHY_AQUANTIA_FEATURES,
+	.features	= PHY_10GBIT_FULL_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT,
 	.aneg_done	= genphy_c45_aneg_done,
 	.config_aneg    = aquantia_config_aneg,
diff --git a/drivers/net/phy/bcm63xx.c b/drivers/net/phy/bcm63xx.c
index cf14613745c9..d95bffdec4c1 100644
--- a/drivers/net/phy/bcm63xx.c
+++ b/drivers/net/phy/bcm63xx.c
@@ -42,6 +42,9 @@ static int bcm63xx_config_init(struct phy_device *phydev)
 {
 	int reg, err;
 
+	/* ASYM_PAUSE bit is marked RO in datasheet, so don't cheat */
+	phydev->supported |= SUPPORTED_Pause;
+
 	reg = phy_read(phydev, MII_BCM63XX_IR);
 	if (reg < 0)
 		return reg;
@@ -65,8 +68,7 @@ static struct phy_driver bcm63xx_driver[] = {
 	.phy_id		= 0x00406000,
 	.phy_id_mask	= 0xfffffc00,
 	.name		= "Broadcom BCM63XX (1)",
-	/* ASYM_PAUSE bit is marked RO in datasheet, so don't cheat */
-	.features	= (PHY_BASIC_FEATURES | SUPPORTED_Pause),
+	.features	= PHY_BASIC_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT | PHY_IS_INTERNAL,
 	.config_init	= bcm63xx_config_init,
 	.ack_interrupt	= bcm_phy_ack_intr,
@@ -75,8 +77,7 @@ static struct phy_driver bcm63xx_driver[] = {
 	/* same phy as above, with just a different OUI */
 	.phy_id		= 0x002bdc00,
 	.phy_id_mask	= 0xfffffc00,
-	.name		= "Broadcom BCM63XX (2)",
-	.features	= (PHY_BASIC_FEATURES | SUPPORTED_Pause),
+	.features	= PHY_BASIC_FEATURES,
 	.flags		= PHY_HAS_INTERRUPT | PHY_IS_INTERNAL,
 	.config_init	= bcm63xx_config_init,
 	.ack_interrupt	= bcm_phy_ack_intr,
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 8872a430d74a..cbec296107bd 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2201,7 +2201,7 @@ static struct phy_driver marvell_drivers[] = {
 		.phy_id = MARVELL_PHY_ID_88E1510,
 		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1510",
-		.features = PHY_GBIT_FEATURES | SUPPORTED_FIBRE,
+		.features = PHY_GBIT_FIBRE_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
 		.probe = &m88e1510_probe,
 		.config_init = &m88e1510_config_init,
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index f214834819dd..1c9d039eec63 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -535,16 +535,7 @@ static struct phy_driver mv3310_drivers[] = {
 		.phy_id		= 0x002b09aa,
 		.phy_id_mask	= MARVELL_PHY_ID_MASK,
 		.name		= "mv88x3310",
-		.features	= SUPPORTED_10baseT_Full |
-				  SUPPORTED_10baseT_Half |
-				  SUPPORTED_100baseT_Full |
-				  SUPPORTED_100baseT_Half |
-				  SUPPORTED_1000baseT_Full |
-				  SUPPORTED_Autoneg |
-				  SUPPORTED_TP |
-				  SUPPORTED_FIBRE |
-				  SUPPORTED_10000baseT_Full |
-				  SUPPORTED_Backplane,
+		.features	= PHY_10GBIT_FEATURES,
 		.soft_reset	= gen10g_no_soft_reset,
 		.config_init	= mv3310_config_init,
 		.probe		= mv3310_probe,
diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c
index b1917dd1978a..c600a8509d60 100644
--- a/drivers/net/phy/microchip_t1.c
+++ b/drivers/net/phy/microchip_t1.c
@@ -46,7 +46,7 @@ static struct phy_driver microchip_t1_phy_driver[] = {
 		.phy_id_mask    = 0xfffffff0,
 		.name           = "Microchip LAN87xx T1",
 
-		.features       = SUPPORTED_100baseT_Full,
+		.features       = PHY_BASIC_T1_FEATURES,
 		.flags          = PHY_HAS_INTERRUPT,
 
 		.config_init    = genphy_config_init,
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 35102e17bbeb..f53ce65f45c5 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/mii.h>
 #include <linux/ethtool.h>
+#include <linux/bitmap.h>
 #include <linux/phy.h>
 #include <linux/phy_led_triggers.h>
 #include <linux/mdio.h>
@@ -42,6 +43,149 @@ MODULE_DESCRIPTION("PHY library");
 MODULE_AUTHOR("Andy Fleming");
 MODULE_LICENSE("GPL");
 
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_basic_features);
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_basic_t1_features);
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_gbit_features);
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_gbit_fibre_features);
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_gbit_all_ports_features);
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_10gbit_features);
+
+static const int phy_basic_ports_array[] = {
+	ETHTOOL_LINK_MODE_Autoneg_BIT,
+	ETHTOOL_LINK_MODE_TP_BIT,
+	ETHTOOL_LINK_MODE_MII_BIT,
+};
+
+static const int phy_fibre_port_array[] = {
+	ETHTOOL_LINK_MODE_FIBRE_BIT,
+};
+
+static const int phy_all_ports_features_array[] = {
+	ETHTOOL_LINK_MODE_Autoneg_BIT,
+	ETHTOOL_LINK_MODE_TP_BIT,
+	ETHTOOL_LINK_MODE_MII_BIT,
+	ETHTOOL_LINK_MODE_FIBRE_BIT,
+	ETHTOOL_LINK_MODE_AUI_BIT,
+	ETHTOOL_LINK_MODE_BNC_BIT,
+	ETHTOOL_LINK_MODE_Backplane_BIT,
+};
+
+static const int phy_10_100_features_array[] = {
+	ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+};
+
+static const int phy_basic_t1_features_array[] = {
+	ETHTOOL_LINK_MODE_TP_BIT,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+};
+
+static const int phy_gbit_features_array[] = {
+	ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+};
+
+static const int phy_10gbit_features_array[] = {
+	ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+};
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_10gbit_full_features);
+
+static const int phy_10gbit_full_features_array[] = {
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+};
+
+static void features_init(void)
+{
+	/* 10/100 half/full*/
+	linkmode_set_bit_array(phy_basic_ports_array,
+			       ARRAY_SIZE(phy_basic_ports_array),
+			       phy_basic_features);
+	linkmode_set_bit_array(phy_10_100_features_array,
+			       ARRAY_SIZE(phy_10_100_features_array),
+			       phy_basic_features);
+
+	/* 100 full, TP */
+	linkmode_set_bit_array(phy_basic_t1_features_array,
+			       ARRAY_SIZE(phy_basic_t1_features_array),
+			       phy_basic_t1_features);
+
+	/* 10/100 half/full + 1000 half/full */
+	linkmode_set_bit_array(phy_basic_ports_array,
+			       ARRAY_SIZE(phy_basic_ports_array),
+			       phy_gbit_features);
+	linkmode_set_bit_array(phy_10_100_features_array,
+			       ARRAY_SIZE(phy_10_100_features_array),
+			       phy_gbit_features);
+	linkmode_set_bit_array(phy_gbit_features_array,
+			       ARRAY_SIZE(phy_gbit_features_array),
+			       phy_gbit_features);
+
+	/* 10/100 half/full + 1000 half/full + fibre*/
+	linkmode_set_bit_array(phy_basic_ports_array,
+			       ARRAY_SIZE(phy_basic_ports_array),
+			       phy_gbit_fibre_features);
+	linkmode_set_bit_array(phy_10_100_features_array,
+			       ARRAY_SIZE(phy_10_100_features_array),
+			       phy_gbit_fibre_features);
+	linkmode_set_bit_array(phy_gbit_features_array,
+			       ARRAY_SIZE(phy_gbit_features_array),
+			       phy_gbit_fibre_features);
+	linkmode_set_bit_array(phy_fibre_port_array,
+			       ARRAY_SIZE(phy_fibre_port_array),
+			       phy_gbit_fibre_features);
+
+	/* 10/100 half/full + 1000 half/full + TP/MII/FIBRE/AUI/BNC/Backplane*/
+	linkmode_set_bit_array(phy_all_ports_features_array,
+			       ARRAY_SIZE(phy_all_ports_features_array),
+			       phy_gbit_all_ports_features);
+	linkmode_set_bit_array(phy_10_100_features_array,
+			       ARRAY_SIZE(phy_10_100_features_array),
+			       phy_gbit_all_ports_features);
+	linkmode_set_bit_array(phy_gbit_features_array,
+			       ARRAY_SIZE(phy_gbit_features_array),
+			       phy_gbit_all_ports_features);
+
+	/* 10/100 half/full + 1000 half/full + 10G full*/
+	linkmode_set_bit_array(phy_all_ports_features_array,
+			       ARRAY_SIZE(phy_all_ports_features_array),
+			       phy_10gbit_features);
+	linkmode_set_bit_array(phy_10_100_features_array,
+			       ARRAY_SIZE(phy_10_100_features_array),
+			       phy_10gbit_features);
+	linkmode_set_bit_array(phy_gbit_features_array,
+			       ARRAY_SIZE(phy_gbit_features_array),
+			       phy_10gbit_features);
+	linkmode_set_bit_array(phy_10gbit_features_array,
+			       ARRAY_SIZE(phy_10gbit_features_array),
+			       phy_10gbit_features);
+
+	/* 10/100/1000/10G full */
+	linkmode_set_bit_array(phy_all_ports_features_array,
+			       ARRAY_SIZE(phy_all_ports_features_array),
+			       phy_10gbit_full_features);
+	linkmode_set_bit_array(phy_10gbit_full_features_array,
+			       ARRAY_SIZE(phy_10gbit_full_features_array),
+			       phy_10gbit_full_features);
+}
+
 void phy_device_free(struct phy_device *phydev)
 {
 	put_device(&phydev->mdio.dev);
@@ -1936,6 +2080,7 @@ static int phy_probe(struct device *dev)
 	struct phy_device *phydev = to_phy_device(dev);
 	struct device_driver *drv = phydev->mdio.dev.driver;
 	struct phy_driver *phydrv = to_phy_driver(drv);
+	u32 features;
 	int err = 0;
 
 	phydev->drv = phydrv;
@@ -1956,7 +2101,8 @@ static int phy_probe(struct device *dev)
 	 * a controller will attach, and may modify one
 	 * or both of these values
 	 */
-	phydev->supported = phydrv->features;
+	ethtool_convert_link_mode_to_legacy_u32(&features, phydrv->features);
+	phydev->supported = features;
 	of_set_phy_supported(phydev);
 	phydev->advertising = phydev->supported;
 
@@ -1976,10 +2122,14 @@ static int phy_probe(struct device *dev)
 	 * (e.g. hardware erratum) where the driver wants to set only one
 	 * of these bits.
 	 */
-	if (phydrv->features & (SUPPORTED_Pause | SUPPORTED_Asym_Pause)) {
+	if (test_bit(ETHTOOL_LINK_MODE_Pause_BIT, phydrv->features) ||
+	    test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, phydrv->features)) {
 		phydev->supported &= ~(SUPPORTED_Pause | SUPPORTED_Asym_Pause);
-		phydev->supported |= phydrv->features &
-				     (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
+		if (test_bit(ETHTOOL_LINK_MODE_Pause_BIT, phydrv->features))
+			phydev->supported |= SUPPORTED_Pause;
+		if (test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+			     phydrv->features))
+			phydev->supported |= SUPPORTED_Asym_Pause;
 	} else {
 		phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
 	}
@@ -2092,9 +2242,7 @@ static struct phy_driver genphy_driver = {
 	.name		= "Generic PHY",
 	.soft_reset	= genphy_no_soft_reset,
 	.config_init	= genphy_config_init,
-	.features	= PHY_GBIT_FEATURES | SUPPORTED_MII |
-			  SUPPORTED_AUI | SUPPORTED_FIBRE |
-			  SUPPORTED_BNC,
+	.features	= PHY_GBIT_ALL_PORTS_FEATURES,
 	.aneg_done	= genphy_aneg_done,
 	.suspend	= genphy_suspend,
 	.resume		= genphy_resume,
@@ -2109,6 +2257,8 @@ static int __init phy_init(void)
 	if (rc)
 		return rc;
 
+	features_init();
+
 	rc = phy_driver_register(&genphy_10g_driver, THIS_MODULE);
 	if (rc)
 		goto err_10g;
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 014fb86c7114..22443d7fb5cd 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -43,6 +43,15 @@ static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
 	__set_bit(nr, addr);
 }
 
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 0f6e7bf5e9c5..dff51dd36e52 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -42,13 +42,21 @@
 #define PHY_1000BT_FEATURES	(SUPPORTED_1000baseT_Half | \
 				 SUPPORTED_1000baseT_Full)
 
-#define PHY_BASIC_FEATURES	(PHY_10BT_FEATURES | \
-				 PHY_100BT_FEATURES | \
-				 PHY_DEFAULT_FEATURES)
-
-#define PHY_GBIT_FEATURES	(PHY_BASIC_FEATURES | \
-				 PHY_1000BT_FEATURES)
-
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init;
+
+#define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features)
+#define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features)
+#define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features)
+#define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features)
+#define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features)
+#define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features)
+#define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features)
 
 /*
  * Set phydev->irq to PHY_POLL if interrupts are not supported,
@@ -510,7 +518,7 @@ struct phy_driver {
 	u32 phy_id;
 	char *name;
 	u32 phy_id_mask;
-	u32 features;
+	const unsigned long * const features;
 	u32 flags;
 	const void *driver_data;
 
-- 
cgit v1.2.3


From 3e48be05f3c7eb6f6126939e9d957903c5cfeee5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 27 Sep 2018 11:28:35 +0200
Subject: netlink: add attribute range validation to policy

Without further bloating the policy structs, we can overload
the `validation_data' pointer with a struct of s16 min, max
and use those to validate ranges in NLA_{U,S}{8,16,32,64}
attributes.

It may sound strange to validate NLA_U32 with a s16 max, but
in many cases NLA_U32 is used for enums etc. since there's no
size benefit in using a smaller attribute width anyway, due
to netlink attribute alignment; in cases like that it's still
useful, particularly when the attribute really transports an
enum value.

Doing so lets us remove quite a bit of validation code, if we
can be sure that these attributes aren't used by userspace in
places where they're ignored today.

To achieve all this, split the 'type' field and introduce a
new 'validation_type' field which indicates what further
validation (beyond the validation prescribed by the type of
the attribute) is done. This currently allows for no further
validation (the default), as well as min, max and range checks.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 67 ++++++++++++++++++++++++++++++++++++++++++++---
 lib/nlattr.c          | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 3698ca8ff92c..d34ceeba82a8 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -188,9 +188,19 @@ enum {
 
 #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)
 
+enum nla_policy_validation {
+	NLA_VALIDATE_NONE,
+	NLA_VALIDATE_RANGE,
+	NLA_VALIDATE_MIN,
+	NLA_VALIDATE_MAX,
+};
+
 /**
  * struct nla_policy - attribute validation policy
  * @type: Type of attribute or NLA_UNSPEC
+ * @validation_type: type of attribute validation done in addition to
+ *	type-specific validation (e.g. range), see
+ *	&enum nla_policy_validation
  * @len: Type specific length of payload
  *
  * Policies are defined as arrays of this struct, the array must be
@@ -238,7 +248,26 @@ enum {
  *                         nested attributes directly inside, while an array has
  *                         the nested attributes at another level down and the
  *                         attributes directly in the nesting don't matter.
- *    All other            Unused
+ *    All other            Unused - but note that it's a union
+ *
+ * Meaning of `min' and `max' fields, use via NLA_POLICY_MIN, NLA_POLICY_MAX
+ * and NLA_POLICY_RANGE:
+ *    NLA_U8,
+ *    NLA_U16,
+ *    NLA_U32,
+ *    NLA_U64,
+ *    NLA_S8,
+ *    NLA_S16,
+ *    NLA_S32,
+ *    NLA_S64              These are used depending on the validation_type
+ *                         field, if that is min/max/range then the minimum,
+ *                         maximum and both are used (respectively) to check
+ *                         the value of the integer attribute.
+ *                         Note that in the interest of code simplicity and
+ *                         struct size both limits are s16, so you cannot
+ *                         enforce a range that doesn't fall within the range
+ *                         of s16 - do that as usual in the code instead.
+ *    All other            Unused - but note that it's a union
  *
  * Example:
  * static const struct nla_policy my_policy[ATTR_MAX+1] = {
@@ -249,9 +278,15 @@ enum {
  * };
  */
 struct nla_policy {
-	u16		type;
+	u8		type;
+	u8		validation_type;
 	u16		len;
-	const void     *validation_data;
+	union {
+		const void *validation_data;
+		struct {
+			s16 min, max;
+		};
+	};
 };
 
 #define NLA_POLICY_EXACT_LEN(_len)	{ .type = NLA_EXACT_LEN, .len = _len }
@@ -266,6 +301,32 @@ struct nla_policy {
 #define NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
 	{ .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr }
 
+#define __NLA_ENSURE(condition) (sizeof(char[1 - 2*!(condition)]) - 1)
+#define NLA_ENSURE_INT_TYPE(tp)				\
+	(__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 ||	\
+		      tp == NLA_S16 || tp == NLA_U16 ||	\
+		      tp == NLA_S32 || tp == NLA_U32 ||	\
+		      tp == NLA_S64 || tp == NLA_U64) + tp)
+
+#define NLA_POLICY_RANGE(tp, _min, _max) {		\
+	.type = NLA_ENSURE_INT_TYPE(tp),		\
+	.validation_type = NLA_VALIDATE_RANGE,		\
+	.min = _min,					\
+	.max = _max					\
+}
+
+#define NLA_POLICY_MIN(tp, _min) {			\
+	.type = NLA_ENSURE_INT_TYPE(tp),		\
+	.validation_type = NLA_VALIDATE_MIN,		\
+	.min = _min,					\
+}
+
+#define NLA_POLICY_MAX(tp, _max) {			\
+	.type = NLA_ENSURE_INT_TYPE(tp),		\
+	.validation_type = NLA_VALIDATE_MAX,		\
+	.max = _max,					\
+}
+
 /**
  * struct nl_info - netlink source information
  * @nlh: Netlink message header of original request
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 2f8feff669a7..5670e4b7dfef 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -95,6 +95,64 @@ static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
 	return 0;
 }
 
+static int nla_validate_int_range(const struct nla_policy *pt,
+				  const struct nlattr *nla,
+				  struct netlink_ext_ack *extack)
+{
+	bool validate_min, validate_max;
+	s64 value;
+
+	validate_min = pt->validation_type == NLA_VALIDATE_RANGE ||
+		       pt->validation_type == NLA_VALIDATE_MIN;
+	validate_max = pt->validation_type == NLA_VALIDATE_RANGE ||
+		       pt->validation_type == NLA_VALIDATE_MAX;
+
+	switch (pt->type) {
+	case NLA_U8:
+		value = nla_get_u8(nla);
+		break;
+	case NLA_U16:
+		value = nla_get_u16(nla);
+		break;
+	case NLA_U32:
+		value = nla_get_u32(nla);
+		break;
+	case NLA_S8:
+		value = nla_get_s8(nla);
+		break;
+	case NLA_S16:
+		value = nla_get_s16(nla);
+		break;
+	case NLA_S32:
+		value = nla_get_s32(nla);
+		break;
+	case NLA_S64:
+		value = nla_get_s64(nla);
+		break;
+	case NLA_U64:
+		/* treat this one specially, since it may not fit into s64 */
+		if ((validate_min && nla_get_u64(nla) < pt->min) ||
+		    (validate_max && nla_get_u64(nla) > pt->max)) {
+			NL_SET_ERR_MSG_ATTR(extack, nla,
+					    "integer out of range");
+			return -ERANGE;
+		}
+		return 0;
+	default:
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if ((validate_min && value < pt->min) ||
+	    (validate_max && value > pt->max)) {
+		NL_SET_ERR_MSG_ATTR(extack, nla,
+				    "integer out of range");
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
 static int validate_nla(const struct nlattr *nla, int maxtype,
 			const struct nla_policy *policy,
 			struct netlink_ext_ack *extack)
@@ -230,6 +288,20 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 			goto out_err;
 	}
 
+	/* further validation */
+	switch (pt->validation_type) {
+	case NLA_VALIDATE_NONE:
+		/* nothing to do */
+		break;
+	case NLA_VALIDATE_RANGE:
+	case NLA_VALIDATE_MIN:
+	case NLA_VALIDATE_MAX:
+		err = nla_validate_int_range(pt, nla, extack);
+		if (err)
+			return err;
+		break;
+	}
+
 	return 0;
 out_err:
 	NL_SET_ERR_MSG_ATTR(extack, nla, "Attribute failed policy validation");
-- 
cgit v1.2.3


From 33188bd6430ef06d206ae4fda2cc92f14f16fd20 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 27 Sep 2018 11:28:36 +0200
Subject: netlink: add validation function to policy

Add the ability to have an arbitrary validation function attached
to a netlink policy that doesn't already use the validation_data
pointer in another way.

This can be useful to validate for example the content of a binary
attribute, like in nl80211 the "(information) elements", which must
be valid streams of "u8 type, u8 length, u8 value[length]".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 24 +++++++++++++++++++++++-
 lib/nlattr.c          |  7 +++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index d34ceeba82a8..6a106ef5ca56 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -193,13 +193,14 @@ enum nla_policy_validation {
 	NLA_VALIDATE_RANGE,
 	NLA_VALIDATE_MIN,
 	NLA_VALIDATE_MAX,
+	NLA_VALIDATE_FUNCTION,
 };
 
 /**
  * struct nla_policy - attribute validation policy
  * @type: Type of attribute or NLA_UNSPEC
  * @validation_type: type of attribute validation done in addition to
- *	type-specific validation (e.g. range), see
+ *	type-specific validation (e.g. range, function call), see
  *	&enum nla_policy_validation
  * @len: Type specific length of payload
  *
@@ -269,6 +270,13 @@ enum nla_policy_validation {
  *                         of s16 - do that as usual in the code instead.
  *    All other            Unused - but note that it's a union
  *
+ * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
+ *    NLA_BINARY           Validation function called for the attribute,
+ *                         not compatible with use of the validation_data
+ *                         as in NLA_BITFIELD32, NLA_REJECT, NLA_NESTED and
+ *                         NLA_NESTED_ARRAY.
+ *    All other            Unused - but note that it's a union
+ *
  * Example:
  * static const struct nla_policy my_policy[ATTR_MAX+1] = {
  * 	[ATTR_FOO] = { .type = NLA_U16 },
@@ -286,6 +294,8 @@ struct nla_policy {
 		struct {
 			s16 min, max;
 		};
+		int (*validate)(const struct nlattr *attr,
+				struct netlink_ext_ack *extack);
 	};
 };
 
@@ -307,6 +317,11 @@ struct nla_policy {
 		      tp == NLA_S16 || tp == NLA_U16 ||	\
 		      tp == NLA_S32 || tp == NLA_U32 ||	\
 		      tp == NLA_S64 || tp == NLA_U64) + tp)
+#define NLA_ENSURE_NO_VALIDATION_PTR(tp)		\
+	(__NLA_ENSURE(tp != NLA_BITFIELD32 &&		\
+		      tp != NLA_REJECT &&		\
+		      tp != NLA_NESTED &&		\
+		      tp != NLA_NESTED_ARRAY) + tp)
 
 #define NLA_POLICY_RANGE(tp, _min, _max) {		\
 	.type = NLA_ENSURE_INT_TYPE(tp),		\
@@ -327,6 +342,13 @@ struct nla_policy {
 	.max = _max,					\
 }
 
+#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) {		\
+	.type = NLA_ENSURE_NO_VALIDATION_PTR(tp),	\
+	.validation_type = NLA_VALIDATE_FUNCTION,	\
+	.validate = fn,					\
+	.len = __VA_ARGS__ + 0,				\
+}
+
 /**
  * struct nl_info - netlink source information
  * @nlh: Netlink message header of original request
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 5670e4b7dfef..1e900bb414ef 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -300,6 +300,13 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 		if (err)
 			return err;
 		break;
+	case NLA_VALIDATE_FUNCTION:
+		if (pt->validate) {
+			err = pt->validate(nla, extack);
+			if (err)
+				return err;
+		}
+		break;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 9f2959b6b52d43326b2f6a0e0d7ffe6f4fc3b5ca Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 28 Sep 2018 08:51:09 +0200
Subject: net: phy: improve handling delayed work

Using mod_delayed_work() allows to simplify handling delayed work and
removes the need for the sync parameter in phy_trigger_machine().
Also introduce a helper phy_queue_state_machine() to encapsulate the
low-level delayed work calls. No functional change intended.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 29 +++++++++++++++--------------
 include/linux/phy.h   |  2 +-
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a1f8e4816f72..14509a8903c6 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -537,7 +537,7 @@ out_unlock:
 	mutex_unlock(&phydev->lock);
 
 	if (trigger)
-		phy_trigger_machine(phydev, sync);
+		phy_trigger_machine(phydev);
 
 	return err;
 }
@@ -635,6 +635,13 @@ int phy_speed_up(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(phy_speed_up);
 
+static void phy_queue_state_machine(struct phy_device *phydev,
+				    unsigned int secs)
+{
+	mod_delayed_work(system_power_efficient_wq, &phydev->state_queue,
+			 secs * HZ);
+}
+
 /**
  * phy_start_machine - start PHY state machine tracking
  * @phydev: the phy_device struct
@@ -647,7 +654,7 @@ EXPORT_SYMBOL_GPL(phy_speed_up);
  */
 void phy_start_machine(struct phy_device *phydev)
 {
-	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
+	phy_queue_state_machine(phydev, 1);
 }
 EXPORT_SYMBOL_GPL(phy_start_machine);
 
@@ -655,19 +662,14 @@ EXPORT_SYMBOL_GPL(phy_start_machine);
  * phy_trigger_machine - trigger the state machine to run
  *
  * @phydev: the phy_device struct
- * @sync: indicate whether we should wait for the workqueue cancelation
  *
  * Description: There has been a change in state which requires that the
  *   state machine runs.
  */
 
-void phy_trigger_machine(struct phy_device *phydev, bool sync)
+void phy_trigger_machine(struct phy_device *phydev)
 {
-	if (sync)
-		cancel_delayed_work_sync(&phydev->state_queue);
-	else
-		cancel_delayed_work(&phydev->state_queue);
-	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
+	phy_queue_state_machine(phydev, 0);
 }
 
 /**
@@ -703,7 +705,7 @@ static void phy_error(struct phy_device *phydev)
 	phydev->state = PHY_HALTED;
 	mutex_unlock(&phydev->lock);
 
-	phy_trigger_machine(phydev, false);
+	phy_trigger_machine(phydev);
 }
 
 /**
@@ -745,7 +747,7 @@ static irqreturn_t phy_change(struct phy_device *phydev)
 	mutex_unlock(&phydev->lock);
 
 	/* reschedule state queue work to run as soon as possible */
-	phy_trigger_machine(phydev, true);
+	phy_trigger_machine(phydev);
 
 	if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev))
 		goto phy_err;
@@ -911,7 +913,7 @@ void phy_start(struct phy_device *phydev)
 	}
 	mutex_unlock(&phydev->lock);
 
-	phy_trigger_machine(phydev, true);
+	phy_trigger_machine(phydev);
 }
 EXPORT_SYMBOL(phy_start);
 
@@ -1130,8 +1132,7 @@ void phy_state_machine(struct work_struct *work)
 	 * called from phy_disconnect() synchronously.
 	 */
 	if (phy_polling_mode(phydev) && old_state != PHY_HALTED)
-		queue_delayed_work(system_power_efficient_wq, &phydev->state_queue,
-				   PHY_STATE_TIME * HZ);
+		phy_queue_state_machine(phydev, PHY_STATE_TIME);
 }
 
 /**
diff --git a/include/linux/phy.h b/include/linux/phy.h
index dff51dd36e52..3ea87f774a76 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1054,7 +1054,7 @@ void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-void phy_trigger_machine(struct phy_device *phydev, bool sync);
+void phy_trigger_machine(struct phy_device *phydev);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
 			       struct ethtool_link_ksettings *cmd);
-- 
cgit v1.2.3


From fb420d5d91c1274d5966917725e71f27ed092a85 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 28 Sep 2018 10:28:44 -0700
Subject: tcp/fq: move back to CLOCK_MONOTONIC

In the recent TCP/EDT patch series, I switched TCP and sch_fq
clocks from MONOTONIC to TAI, in order to meet the choice done
earlier for sch_etf packet scheduler.

But sure enough, this broke some setups were the TAI clock
jumps forward (by almost 50 year...), as reported
by Leonard Crestez.

If we want to converge later, we'll probably need to add
an skb field to differentiate the clock bases, or a socket option.

In the meantime, an UDP application will need to use CLOCK_MONOTONIC
base for its SCM_TXTIME timestamps if using fq packet scheduler.

Fixes: 72b0094f9182 ("tcp: switch tcp_clock_ns() to CLOCK_TAI base")
Fixes: 142537e41923 ("net_sched: sch_fq: switch to CLOCK_TAI")
Fixes: fd2bca2aa789 ("tcp: switch internal pacing timer to CLOCK_TAI")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Leonard Crestez <leonard.crestez@nxp.com>
Tested-by: Leonard Crestez <leonard.crestez@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 2 +-
 net/ipv4/tcp_timer.c | 2 +-
 net/sched/sch_fq.c   | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ff15d8e0d525..0d2929223c70 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -732,7 +732,7 @@ void tcp_send_window_probe(struct sock *sk);
 
 static inline u64 tcp_clock_ns(void)
 {
-	return ktime_get_tai_ns();
+	return ktime_get_ns();
 }
 
 static inline u64 tcp_clock_us(void)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4f661e178da8..61023d50cd60 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -758,7 +758,7 @@ void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
-	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_TAI,
+	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 628a2cdcfc6f..338222a6c664 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -412,7 +412,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	u64 now = ktime_get_tai_ns();
+	u64 now = ktime_get_ns();
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
@@ -776,7 +776,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->fq_trees_log		= ilog2(1024);
 	q->orphan_mask		= 1024 - 1;
 	q->low_rate_threshold	= 550000 / 8;
-	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_TAI);
+	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
 
 	if (opt)
 		err = fq_change(sch, opt, extack);
@@ -831,7 +831,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.flows_plimit		  = q->stat_flows_plimit;
 	st.pkts_too_long	  = q->stat_pkts_too_long;
 	st.allocation_errors	  = q->stat_allocation_errors;
-	st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_tai_ns();
+	st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
 	st.flows		  = q->flows;
 	st.inactive_flows	  = q->inactive_flows;
 	st.throttled_flows	  = q->throttled_flows;
-- 
cgit v1.2.3


From 81e54d08d9d845053111f30045a93f3eb1c3ca96 Mon Sep 17 00:00:00 2001
From: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Date: Thu, 20 Sep 2018 17:30:09 -0700
Subject: cfg80211: support FTM responder configuration/statistics

Allow userspace to enable fine timing measurement responder
functionality with configurable lci/civic parameters in AP mode.
This can be done at AP start or changing beacon parameters.

A new EXT_FEATURE flag is introduced for drivers to advertise
the capability.

Also nl80211 API support for retrieving statistics is added.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
[remove unused cfg80211_ftm_responder_params, clarify docs,
 move validation into policy]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  52 +++++++++++++++++
 include/uapi/linux/nl80211.h |  90 +++++++++++++++++++++++++++++
 net/wireless/nl80211.c       | 132 +++++++++++++++++++++++++++++++++++++++++--
 net/wireless/rdev-ops.h      |  15 +++++
 net/wireless/trace.h         |  44 +++++++++++++++
 5 files changed, 328 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9f3ed79c39d7..deb313105014 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -775,6 +775,12 @@ struct cfg80211_crypto_settings {
  * @assocresp_ies_len: length of assocresp_ies in octets
  * @probe_resp_len: length of probe response template (@probe_resp)
  * @probe_resp: probe response template (AP mode only)
+ * @ftm_responder: enable FTM responder functionality; -1 for no change
+ *	(which also implies no change in LCI/civic location data)
+ * @lci: LCI subelement content
+ * @civicloc: Civic location subelement content
+ * @lci_len: LCI data length
+ * @civicloc_len: Civic location data length
  */
 struct cfg80211_beacon_data {
 	const u8 *head, *tail;
@@ -782,12 +788,17 @@ struct cfg80211_beacon_data {
 	const u8 *proberesp_ies;
 	const u8 *assocresp_ies;
 	const u8 *probe_resp;
+	const u8 *lci;
+	const u8 *civicloc;
+	s8 ftm_responder;
 
 	size_t head_len, tail_len;
 	size_t beacon_ies_len;
 	size_t proberesp_ies_len;
 	size_t assocresp_ies_len;
 	size_t probe_resp_len;
+	size_t lci_len;
+	size_t civicloc_len;
 };
 
 struct mac_address {
@@ -2796,6 +2807,40 @@ struct cfg80211_external_auth_params {
 	u16 status;
 };
 
+/**
+ * cfg80211_ftm_responder_stats - FTM responder statistics
+ *
+ * @filled: bitflag of flags using the bits of &enum nl80211_ftm_stats to
+ *	indicate the relevant values in this struct for them
+ * @success_num: number of FTM sessions in which all frames were successfully
+ *	answered
+ * @partial_num: number of FTM sessions in which part of frames were
+ *	successfully answered
+ * @failed_num: number of failed FTM sessions
+ * @asap_num: number of ASAP FTM sessions
+ * @non_asap_num: number of  non-ASAP FTM sessions
+ * @total_duration_ms: total sessions durations - gives an indication
+ *	of how much time the responder was busy
+ * @unknown_triggers_num: number of unknown FTM triggers - triggers from
+ *	initiators that didn't finish successfully the negotiation phase with
+ *	the responder
+ * @reschedule_requests_num: number of FTM reschedule requests - initiator asks
+ *	for a new scheduling although it already has scheduled FTM slot
+ * @out_of_window_triggers_num: total FTM triggers out of scheduled window
+ */
+struct cfg80211_ftm_responder_stats {
+	u32 filled;
+	u32 success_num;
+	u32 partial_num;
+	u32 failed_num;
+	u32 asap_num;
+	u32 non_asap_num;
+	u64 total_duration_ms;
+	u32 unknown_triggers_num;
+	u32 reschedule_requests_num;
+	u32 out_of_window_triggers_num;
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -3128,6 +3173,9 @@ struct cfg80211_external_auth_params {
  *
  * @tx_control_port: TX a control port frame (EAPoL).  The noencrypt parameter
  *	tells the driver that the frame should not be encrypted.
+ *
+ * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
+ *	Statistics should be cumulative, currently no way to reset is provided.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3433,6 +3481,10 @@ struct cfg80211_ops {
 				   const u8 *buf, size_t len,
 				   const u8 *dest, const __be16 proto,
 				   const bool noencrypt);
+
+	int	(*get_ftm_responder_stats)(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct cfg80211_ftm_responder_stats *ftm_stats);
 };
 
 /*
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index cfc94178d608..dc6d5a1ef470 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1033,6 +1033,9 @@
  *	%NL80211_ATTR_CHANNEL_WIDTH,%NL80211_ATTR_NSS attributes with its
  *	address(specified in %NL80211_ATTR_MAC).
  *
+ * @NL80211_CMD_GET_FTM_RESPONDER_STATS: Retrieve FTM responder statistics, in
+ *	the %NL80211_ATTR_FTM_RESPONDER_STATS attribute.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1245,6 +1248,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_CONTROL_PORT_FRAME,
 
+	NL80211_CMD_GET_FTM_RESPONDER_STATS,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2241,6 +2246,14 @@ enum nl80211_commands {
  *	association request when used with NL80211_CMD_NEW_STATION). Can be set
  *	only if %NL80211_STA_FLAG_WME is set.
  *
+ * @NL80211_ATTR_FTM_RESPONDER: nested attribute which user-space can include
+ *	in %NL80211_CMD_START_AP or %NL80211_CMD_SET_BEACON for fine timing
+ *	measurement (FTM) responder functionality and containing parameters as
+ *	possible, see &enum nl80211_ftm_responder_attr
+ *
+ * @NL80211_ATTR_FTM_RESPONDER_STATS: Nested attribute with FTM responder
+ *	statistics, see &enum nl80211_ftm_responder_stats.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2682,6 +2695,10 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_HE_CAPABILITY,
 
+	NL80211_ATTR_FTM_RESPONDER,
+
+	NL80211_ATTR_FTM_RESPONDER_STATS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -5225,6 +5242,8 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT: Driver/device can omit all data
  *	except for supported rates from the probe request content if requested
  *	by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag.
+ * @NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER: Driver supports enabling fine
+ *	timing measurement responder role.
  *
  * @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0: Driver/device confirm that they are
  *      able to rekey an in-use key correctly. Userspace must not rekey PTK keys
@@ -5269,6 +5288,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SCAN_RANDOM_SN,
 	NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT,
 	NL80211_EXT_FEATURE_CAN_REPLACE_PTK0,
+	NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
@@ -5808,4 +5828,74 @@ enum nl80211_external_auth_action {
 	NL80211_EXTERNAL_AUTH_ABORT,
 };
 
+/**
+ * enum nl80211_ftm_responder_attributes - fine timing measurement
+ *	responder attributes
+ * @__NL80211_FTM_RESP_ATTR_INVALID: Invalid
+ * @NL80211_FTM_RESP_ATTR_ENABLED: FTM responder is enabled
+ * @NL80211_FTM_RESP_ATTR_LCI: The content of Measurement Report Element
+ *	(9.4.2.22 in 802.11-2016) with type 8 - LCI (9.4.2.22.10)
+ * @NL80211_FTM_RESP_ATTR_CIVIC: The content of Measurement Report Element
+ *	(9.4.2.22 in 802.11-2016) with type 11 - Civic (Section 9.4.2.22.13)
+ * @__NL80211_FTM_RESP_ATTR_LAST: Internal
+ * @NL80211_FTM_RESP_ATTR_MAX: highest FTM responder attribute.
+ */
+enum nl80211_ftm_responder_attributes {
+	__NL80211_FTM_RESP_ATTR_INVALID,
+
+	NL80211_FTM_RESP_ATTR_ENABLED,
+	NL80211_FTM_RESP_ATTR_LCI,
+	NL80211_FTM_RESP_ATTR_CIVICLOC,
+
+	/* keep last */
+	__NL80211_FTM_RESP_ATTR_LAST,
+	NL80211_FTM_RESP_ATTR_MAX = __NL80211_FTM_RESP_ATTR_LAST - 1,
+};
+
+/*
+ * enum nl80211_ftm_responder_stats - FTM responder statistics
+ *
+ * These attribute types are used with %NL80211_ATTR_FTM_RESPONDER_STATS
+ * when getting FTM responder statistics.
+ *
+ * @__NL80211_FTM_STATS_INVALID: attribute number 0 is reserved
+ * @NL80211_FTM_STATS_SUCCESS_NUM: number of FTM sessions in which all frames
+ *	were ssfully answered (u32)
+ * @NL80211_FTM_STATS_PARTIAL_NUM: number of FTM sessions in which part of the
+ *	frames were successfully answered (u32)
+ * @NL80211_FTM_STATS_FAILED_NUM: number of failed FTM sessions (u32)
+ * @NL80211_FTM_STATS_ASAP_NUM: number of ASAP sessions (u32)
+ * @NL80211_FTM_STATS_NON_ASAP_NUM: number of non-ASAP sessions (u32)
+ * @NL80211_FTM_STATS_TOTAL_DURATION_MSEC: total sessions durations - gives an
+ *	indication of how much time the responder was busy (u64, msec)
+ * @NL80211_FTM_STATS_UNKNOWN_TRIGGERS_NUM: number of unknown FTM triggers -
+ *	triggers from initiators that didn't finish successfully the negotiation
+ *	phase with the responder (u32)
+ * @NL80211_FTM_STATS_RESCHEDULE_REQUESTS_NUM: number of FTM reschedule requests
+ *	- initiator asks for a new scheduling although it already has scheduled
+ *	FTM slot (u32)
+ * @NL80211_FTM_STATS_OUT_OF_WINDOW_TRIGGERS_NUM: number of FTM triggers out of
+ *	scheduled window (u32)
+ * @NL80211_FTM_STATS_PAD: used for padding, ignore
+ * @__NL80211_TXQ_ATTR_AFTER_LAST: Internal
+ * @NL80211_FTM_STATS_MAX: highest possible FTM responder stats attribute
+ */
+enum nl80211_ftm_responder_stats {
+	__NL80211_FTM_STATS_INVALID,
+	NL80211_FTM_STATS_SUCCESS_NUM,
+	NL80211_FTM_STATS_PARTIAL_NUM,
+	NL80211_FTM_STATS_FAILED_NUM,
+	NL80211_FTM_STATS_ASAP_NUM,
+	NL80211_FTM_STATS_NON_ASAP_NUM,
+	NL80211_FTM_STATS_TOTAL_DURATION_MSEC,
+	NL80211_FTM_STATS_UNKNOWN_TRIGGERS_NUM,
+	NL80211_FTM_STATS_RESCHEDULE_REQUESTS_NUM,
+	NL80211_FTM_STATS_OUT_OF_WINDOW_TRIGGERS_NUM,
+	NL80211_FTM_STATS_PAD,
+
+	/* keep last */
+	__NL80211_FTM_STATS_AFTER_LAST,
+	NL80211_FTM_STATS_MAX = __NL80211_FTM_STATS_AFTER_LAST - 1
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 90788ebe794e..235a43185e8d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -201,6 +201,15 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
 }
 
 /* policy for the attributes */
+static const struct nla_policy
+nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = {
+	[NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, },
+	[NL80211_FTM_RESP_ATTR_LCI] = { .type = NLA_BINARY,
+					.len = U8_MAX },
+	[NL80211_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_BINARY,
+					     .len = U8_MAX },
+};
+
 static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
 	[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
@@ -430,6 +439,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
 	[NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
 					 .len = NL80211_HE_MAX_CAPABILITY_LEN },
+
+	[NL80211_ATTR_FTM_RESPONDER] = {
+		.type = NLA_NESTED,
+		.validation_data = nl80211_ftm_responder_policy,
+	},
 };
 
 /* policy for the key attributes */
@@ -3989,10 +4003,12 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
 	return 0;
 }
 
-static int nl80211_parse_beacon(struct nlattr *attrs[],
+static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
+				struct nlattr *attrs[],
 				struct cfg80211_beacon_data *bcn)
 {
 	bool haveinfo = false;
+	int err;
 
 	if (!is_valid_ie_attr(attrs[NL80211_ATTR_BEACON_TAIL]) ||
 	    !is_valid_ie_attr(attrs[NL80211_ATTR_IE]) ||
@@ -4043,6 +4059,35 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
 		bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]);
 	}
 
+	if (attrs[NL80211_ATTR_FTM_RESPONDER]) {
+		struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1];
+
+		err = nla_parse_nested(tb, NL80211_FTM_RESP_ATTR_MAX,
+				       attrs[NL80211_ATTR_FTM_RESPONDER],
+				       NULL, NULL);
+		if (err)
+			return err;
+
+		if (tb[NL80211_FTM_RESP_ATTR_ENABLED] &&
+		    wiphy_ext_feature_isset(&rdev->wiphy,
+					    NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
+			bcn->ftm_responder = 1;
+		else
+			return -EOPNOTSUPP;
+
+		if (tb[NL80211_FTM_RESP_ATTR_LCI]) {
+			bcn->lci = nla_data(tb[NL80211_FTM_RESP_ATTR_LCI]);
+			bcn->lci_len = nla_len(tb[NL80211_FTM_RESP_ATTR_LCI]);
+		}
+
+		if (tb[NL80211_FTM_RESP_ATTR_CIVICLOC]) {
+			bcn->civicloc = nla_data(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
+			bcn->civicloc_len = nla_len(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
+		}
+	} else {
+		bcn->ftm_responder = -1;
+	}
+
 	return 0;
 }
 
@@ -4189,7 +4234,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	    !info->attrs[NL80211_ATTR_BEACON_HEAD])
 		return -EINVAL;
 
-	err = nl80211_parse_beacon(info->attrs, &params.beacon);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon);
 	if (err)
 		return err;
 
@@ -4373,7 +4418,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	if (!wdev->beacon_interval)
 		return -EINVAL;
 
-	err = nl80211_parse_beacon(info->attrs, &params);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params);
 	if (err)
 		return err;
 
@@ -7935,7 +7980,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 	if (!need_new_beacon)
 		goto skip_beacons;
 
-	err = nl80211_parse_beacon(info->attrs, &params.beacon_after);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after);
 	if (err)
 		return err;
 
@@ -7945,7 +7990,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		return err;
 
-	err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa);
+	err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa);
 	if (err)
 		return err;
 
@@ -12984,6 +13029,76 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
+					   struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_ftm_responder_stats ftm_stats = {};
+	struct sk_buff *msg;
+	void *hdr;
+	struct nlattr *ftm_stats_attr;
+	int err;
+
+	if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval)
+		return -EOPNOTSUPP;
+
+	err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats);
+	if (err)
+		return err;
+
+	if (!ftm_stats.filled)
+		return -ENODATA;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+			     NL80211_CMD_GET_FTM_RESPONDER_STATS);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
+		goto nla_put_failure;
+
+	ftm_stats_attr = nla_nest_start(msg, NL80211_ATTR_FTM_RESPONDER_STATS);
+	if (!ftm_stats_attr)
+		goto nla_put_failure;
+
+#define SET_FTM(field, name, type)					 \
+	do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
+	    nla_put_ ## type(msg, NL80211_FTM_STATS_ ## name,		 \
+			     ftm_stats.field))				 \
+		goto nla_put_failure; } while (0)
+#define SET_FTM_U64(field, name)					 \
+	do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
+	    nla_put_u64_64bit(msg, NL80211_FTM_STATS_ ## name,		 \
+			      ftm_stats.field, NL80211_FTM_STATS_PAD))	 \
+		goto nla_put_failure; } while (0)
+
+	SET_FTM(success_num, SUCCESS_NUM, u32);
+	SET_FTM(partial_num, PARTIAL_NUM, u32);
+	SET_FTM(failed_num, FAILED_NUM, u32);
+	SET_FTM(asap_num, ASAP_NUM, u32);
+	SET_FTM(non_asap_num, NON_ASAP_NUM, u32);
+	SET_FTM_U64(total_duration_ms, TOTAL_DURATION_MSEC);
+	SET_FTM(unknown_triggers_num, UNKNOWN_TRIGGERS_NUM, u32);
+	SET_FTM(reschedule_requests_num, RESCHEDULE_REQUESTS_NUM, u32);
+	SET_FTM(out_of_window_triggers_num, OUT_OF_WINDOW_TRIGGERS_NUM, u32);
+#undef SET_FTM
+
+	nla_nest_end(msg, ftm_stats_attr);
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -13895,6 +14010,13 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS,
+		.doit = nl80211_get_ftm_responder_stats,
+		.policy = nl80211_policy,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 364f5d67f05b..51380b5c32f2 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1232,4 +1232,19 @@ rdev_external_auth(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
+			     struct net_device *dev,
+			     struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+	int ret = -EOPNOTSUPP;
+
+	trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats);
+	if (rdev->ops->get_ftm_responder_stats)
+		ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev,
+							ftm_stats);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index e51348e24ff5..7e0380192445 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3250,6 +3250,50 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
 	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
 	TP_ARGS(wiphy, wdev)
 );
+
+TRACE_EVENT(rdev_get_ftm_responder_stats,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct cfg80211_ftm_responder_stats *ftm_stats),
+
+	TP_ARGS(wiphy, netdev, ftm_stats),
+
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		__field(u64, timestamp)
+		__field(u32, success_num)
+		__field(u32, partial_num)
+		__field(u32, failed_num)
+		__field(u32, asap_num)
+		__field(u32, non_asap_num)
+		__field(u64, duration)
+		__field(u32, unknown_triggers)
+		__field(u32, reschedule)
+		__field(u32, out_of_window)
+	),
+
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		__entry->success_num = ftm_stats->success_num;
+		__entry->partial_num = ftm_stats->partial_num;
+		__entry->failed_num = ftm_stats->failed_num;
+		__entry->asap_num = ftm_stats->asap_num;
+		__entry->non_asap_num = ftm_stats->non_asap_num;
+		__entry->duration = ftm_stats->total_duration_ms;
+		__entry->unknown_triggers = ftm_stats->unknown_triggers_num;
+		__entry->reschedule = ftm_stats->reschedule_requests_num;
+		__entry->out_of_window = ftm_stats->out_of_window_triggers_num;
+	),
+
+	TP_printk(WIPHY_PR_FMT "Ftm responder stats: success %u, partial %u, "
+		"failed %u, asap %u, non asap %u, total duration %llu, unknown "
+		"triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG,
+		__entry->success_num, __entry->partial_num, __entry->failed_num,
+		__entry->asap_num, __entry->non_asap_num, __entry->duration,
+		__entry->unknown_triggers, __entry->reschedule,
+		__entry->out_of_window)
+);
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From b60ad3485106b5845113e7a2745abb7e64b15d6d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 1 Oct 2018 11:52:07 +0200
Subject: cfg80211: move cookie_counter out of wiphy

There's no reason for drivers to be able to access the
cfg80211 internal cookie counter; move it out of the
wiphy into the rdev structure.

While at it, also make it never assign 0 as a cookie
(we consider that invalid in some places), and warn if
we manage to do that for some reason (wrapping is not
likely to happen with a u64.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  3 ---
 net/wireless/core.h    | 11 +++++++++++
 net/wireless/nl80211.c |  4 ++--
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index deb313105014..8f5ee2c2da04 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4012,7 +4012,6 @@ struct wiphy_iftype_ext_capab {
  *	by the driver in the .connect() callback. The bit position maps to the
  *	attribute indices defined in &enum nl80211_bss_select_attr.
  *
- * @cookie_counter: unique generic cookie counter, used to identify objects.
  * @nan_supported_bands: bands supported by the device in NAN mode, a
  *	bitmap of &enum nl80211_band values.  For instance, for
  *	NL80211_BAND_2GHZ, bit 0 would be set
@@ -4151,8 +4150,6 @@ struct wiphy {
 
 	u32 bss_select_support;
 
-	u64 cookie_counter;
-
 	u8 nan_supported_bands;
 
 	u32 txq_limit;
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 45fd4e21dbda..c61dbba8bf47 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -66,6 +66,7 @@ struct cfg80211_registered_device {
 	/* protected by RTNL only */
 	int num_running_ifaces;
 	int num_running_monitor_ifaces;
+	u64 cookie_counter;
 
 	/* BSSes/scanning */
 	spinlock_t bss_lock;
@@ -133,6 +134,16 @@ cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
 #endif
 }
 
+static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev)
+{
+	u64 r = ++rdev->cookie_counter;
+
+	if (WARN_ON(r == 0))
+		r = ++rdev->cookie_counter;
+
+	return r;
+}
+
 extern struct workqueue_struct *cfg80211_wq;
 extern struct list_head cfg80211_rdev_list;
 extern int cfg80211_rdev_list_generation;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8f962c79987a..60ce2eb57fbb 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7803,7 +7803,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
 	 */
 	if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) {
 		while (!sched_scan_req->reqid)
-			sched_scan_req->reqid = rdev->wiphy.cookie_counter++;
+			sched_scan_req->reqid = cfg80211_assign_cookie(rdev);
 	}
 
 	err = rdev_sched_scan_start(rdev, dev, sched_scan_req);
@@ -11798,7 +11798,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
 	if (!func)
 		return -ENOMEM;
 
-	func->cookie = wdev->wiphy->cookie_counter++;
+	func->cookie = cfg80211_assign_cookie(rdev);
 
 	if (!tb[NL80211_NAN_FUNC_TYPE] ||
 	    nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
-- 
cgit v1.2.3


From cc16567e5a8a7bb9439ef61ab80069acdd33f76f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 2 Oct 2018 11:03:40 +0200
Subject: net: drop unused skb_append_datato_frags()

This helper is unused since commit 988cf74deb45 ("inet:
Stop generating UFO packets.")

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  5 -----
 net/core/skbuff.c      | 58 --------------------------------------------------
 2 files changed, 63 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 87e29710373f..119d092c6b13 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1082,11 +1082,6 @@ static inline int skb_pad(struct sk_buff *skb, int pad)
 }
 #define dev_kfree_skb(a)	consume_skb(a)
 
-int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
-			    int getfrag(void *from, char *to, int offset,
-					int len, int odd, struct sk_buff *skb),
-			    void *from, int length);
-
 int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 			 int offset, size_t size);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2c807f67aba..0e937d3d85b5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3381,64 +3381,6 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
 }
 EXPORT_SYMBOL(skb_find_text);
 
-/**
- * skb_append_datato_frags - append the user data to a skb
- * @sk: sock  structure
- * @skb: skb structure to be appended with user data.
- * @getfrag: call back function to be used for getting the user data
- * @from: pointer to user message iov
- * @length: length of the iov message
- *
- * Description: This procedure append the user data in the fragment part
- * of the skb if any page alloc fails user this procedure returns  -ENOMEM
- */
-int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
-			int (*getfrag)(void *from, char *to, int offset,
-					int len, int odd, struct sk_buff *skb),
-			void *from, int length)
-{
-	int frg_cnt = skb_shinfo(skb)->nr_frags;
-	int copy;
-	int offset = 0;
-	int ret;
-	struct page_frag *pfrag = &current->task_frag;
-
-	do {
-		/* Return error if we don't have space for new frag */
-		if (frg_cnt >= MAX_SKB_FRAGS)
-			return -EMSGSIZE;
-
-		if (!sk_page_frag_refill(sk, pfrag))
-			return -ENOMEM;
-
-		/* copy the user data to page */
-		copy = min_t(int, length, pfrag->size - pfrag->offset);
-
-		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
-			      offset, copy, 0, skb);
-		if (ret < 0)
-			return -EFAULT;
-
-		/* copy was successful so update the size parameters */
-		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
-				   copy);
-		frg_cnt++;
-		pfrag->offset += copy;
-		get_page(pfrag->page);
-
-		skb->truesize += copy;
-		refcount_add(copy, &sk->sk_wmem_alloc);
-		skb->len += copy;
-		skb->data_len += copy;
-		offset += copy;
-		length -= copy;
-
-	} while (length > 0);
-
-	return 0;
-}
-EXPORT_SYMBOL(skb_append_datato_frags);
-
 int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 			 int offset, size_t size)
 {
-- 
cgit v1.2.3


From 5bf0961cc6a180c077793f2615a8fd842c655876 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 2 Oct 2018 06:16:11 -0700
Subject: qed: Add driver support for 20G link speed.

Add driver support for configuring/reading the 20G link speed.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |  3 +++
 drivers/net/ethernet/qlogic/qed/qed_hsi.h  |  2 ++
 drivers/net/ethernet/qlogic/qed/qed_main.c | 11 +++++++++++
 include/linux/qed/qed_if.h                 |  9 +++++----
 4 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 0fbeafeef7a0..7ceb2b97538d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2679,6 +2679,9 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	case NVM_CFG1_PORT_DRV_LINK_SPEED_10G:
 		link->speed.forced_speed = 10000;
 		break;
+	case NVM_CFG1_PORT_DRV_LINK_SPEED_20G:
+		link->speed.forced_speed = 20000;
+		break;
 	case NVM_CFG1_PORT_DRV_LINK_SPEED_25G:
 		link->speed.forced_speed = 25000;
 		break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index d4d08383c753..56578f888b70 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -13154,6 +13154,7 @@ struct nvm_cfg1_port {
 #define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_OFFSET		0
 #define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G		0x1
 #define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G		0x2
+#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G             0x4
 #define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G		0x8
 #define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G		0x10
 #define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G		0x20
@@ -13164,6 +13165,7 @@ struct nvm_cfg1_port {
 #define NVM_CFG1_PORT_DRV_LINK_SPEED_AUTONEG			0x0
 #define NVM_CFG1_PORT_DRV_LINK_SPEED_1G				0x1
 #define NVM_CFG1_PORT_DRV_LINK_SPEED_10G			0x2
+#define NVM_CFG1_PORT_DRV_LINK_SPEED_20G                        0x3
 #define NVM_CFG1_PORT_DRV_LINK_SPEED_25G			0x4
 #define NVM_CFG1_PORT_DRV_LINK_SPEED_40G			0x5
 #define NVM_CFG1_PORT_DRV_LINK_SPEED_50G			0x6
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 2094d86a7a08..75d217aaf8ce 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1337,6 +1337,9 @@ static int qed_set_link(struct qed_dev *cdev, struct qed_link_params *params)
 		if (params->adv_speeds & QED_LM_10000baseKR_Full_BIT)
 			link_params->speed.advertised_speeds |=
 			    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G;
+		if (params->adv_speeds & QED_LM_20000baseKR2_Full_BIT)
+			link_params->speed.advertised_speeds |=
+				NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G;
 		if (params->adv_speeds & QED_LM_25000baseKR_Full_BIT)
 			link_params->speed.advertised_speeds |=
 			    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G;
@@ -1502,6 +1505,9 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 	if (params.speed.advertised_speeds &
 	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G)
 		if_link->advertised_caps |= QED_LM_10000baseKR_Full_BIT;
+	if (params.speed.advertised_speeds &
+	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
+		if_link->advertised_caps |= QED_LM_20000baseKR2_Full_BIT;
 	if (params.speed.advertised_speeds &
 	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G)
 		if_link->advertised_caps |= QED_LM_25000baseKR_Full_BIT;
@@ -1522,6 +1528,9 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 	if (link_caps.speed_capabilities &
 	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G)
 		if_link->supported_caps |= QED_LM_10000baseKR_Full_BIT;
+	if (link_caps.speed_capabilities &
+	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
+		if_link->supported_caps |= QED_LM_20000baseKR2_Full_BIT;
 	if (link_caps.speed_capabilities &
 	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G)
 		if_link->supported_caps |= QED_LM_25000baseKR_Full_BIT;
@@ -1559,6 +1568,8 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 		if_link->lp_caps |= QED_LM_1000baseT_Full_BIT;
 	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_10G)
 		if_link->lp_caps |= QED_LM_10000baseKR_Full_BIT;
+	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_20G)
+		if_link->lp_caps |= QED_LM_20000baseKR2_Full_BIT;
 	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_25G)
 		if_link->lp_caps |= QED_LM_25000baseKR_Full_BIT;
 	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_40G)
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 8cd34645e892..dee3c9c744f7 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -670,10 +670,11 @@ enum qed_link_mode_bits {
 	QED_LM_1000baseT_Half_BIT = BIT(4),
 	QED_LM_1000baseT_Full_BIT = BIT(5),
 	QED_LM_10000baseKR_Full_BIT = BIT(6),
-	QED_LM_25000baseKR_Full_BIT = BIT(7),
-	QED_LM_40000baseLR4_Full_BIT = BIT(8),
-	QED_LM_50000baseKR2_Full_BIT = BIT(9),
-	QED_LM_100000baseKR4_Full_BIT = BIT(10),
+	QED_LM_20000baseKR2_Full_BIT = BIT(7),
+	QED_LM_25000baseKR_Full_BIT = BIT(8),
+	QED_LM_40000baseLR4_Full_BIT = BIT(9),
+	QED_LM_50000baseKR2_Full_BIT = BIT(10),
+	QED_LM_100000baseKR4_Full_BIT = BIT(11),
 	QED_LM_COUNT = 11
 };
 
-- 
cgit v1.2.3


From d456336d164886d9339aaa112d6595e1c142f8bc Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Sat, 29 Sep 2018 23:44:50 -0700
Subject: net: remove 1 always zero parameter from ip6_redirect_no_header()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(the parameter in question is mark)

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 3 +--
 net/ipv6/ndisc.c        | 2 +-
 net/ipv6/route.c        | 4 +---
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7b9c82de11cc..cef186dbd2ce 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -165,8 +165,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
 		  kuid_t uid);
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
-			    u32 mark);
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif);
 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
 
 struct netlink_callback;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0ec273997d1d..51863ada15a4 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 
 	if (!ndopts.nd_opts_rh) {
 		ip6_redirect_no_header(skb, dev_net(skb->dev),
-					skb->dev->ifindex, 0);
+					skb->dev->ifindex);
 		return;
 	}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dff80697c033..e50525a95a09 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2524,8 +2524,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
 }
 EXPORT_SYMBOL_GPL(ip6_redirect);
 
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
-			    u32 mark)
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
@@ -2533,7 +2532,6 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
 	struct flowi6 fl6 = {
 		.flowi6_iif = LOOPBACK_IFINDEX,
 		.flowi6_oif = oif,
-		.flowi6_mark = mark,
 		.daddr = msg->dest,
 		.saddr = iph->daddr,
 		.flowi6_uid = sock_net_uid(net, NULL),
-- 
cgit v1.2.3


From f3709f69b7c5cba6323cc03c29b64293b93be817 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:29 -0700
Subject: bpf: Add iterator for spilled registers

Add this iterator for spilled registers, it concentrates the details of
how to get the current frame's spilled registers into a single macro
while clarifying the intention of the code which is calling the macro.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 11 +++++++++++
 kernel/bpf/verifier.c        | 16 +++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b42b60a83e19..d0e7f97e8b60 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -131,6 +131,17 @@ struct bpf_verifier_state {
 	u32 curframe;
 };
 
+#define bpf_get_spilled_reg(slot, frame)				\
+	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
+	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	 ? &frame->stack[slot].spilled_ptr : NULL)
+
+/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
+#define bpf_for_each_spilled_reg(iter, frame, reg)			\
+	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame);		\
+	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
+	     iter++, reg = bpf_get_spilled_reg(iter, frame))
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8cc83a970d1..9c82d8f58085 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2252,10 +2252,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
 		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(env, regs, i);
 
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] != STACK_SPILL)
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
 			continue;
-		reg = &state->stack[i].spilled_ptr;
 		if (reg_is_pkt_pointer_any(reg))
 			__mark_reg_unknown(reg);
 	}
@@ -3395,10 +3394,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
+		bpf_for_each_spilled_reg(i, state, reg) {
+			if (!reg)
 				continue;
-			reg = &state->stack[i].spilled_ptr;
 			if (reg->type == type && reg->id == dst_reg->id)
 				reg->range = max(reg->range, new_range);
 		}
@@ -3643,7 +3641,7 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 			  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *reg, *regs = state->regs;
 	u32 id = regs[regno].id;
 	int i, j;
 
@@ -3652,8 +3650,8 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
+		bpf_for_each_spilled_reg(i, state, reg) {
+			if (!reg)
 				continue;
 			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
 		}
-- 
cgit v1.2.3


From c64b7983288e636356f7f5f652de4813e1cfedac Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:33 -0700
Subject: bpf: Add PTR_TO_SOCKET verifier type

Teach the verifier a little bit about a new type of pointer, a
PTR_TO_SOCKET. This pointer type is accessed from BPF through the
'struct bpf_sock' structure.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h          |  34 ++++++++++++
 include/linux/bpf_verifier.h |   2 +
 kernel/bpf/verifier.c        | 120 ++++++++++++++++++++++++++++++++++++++-----
 net/core/filter.c            |  30 ++++++-----
 4 files changed, 160 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 018299a595c8..027697b6a22f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -154,6 +154,7 @@ enum bpf_arg_type {
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
+	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
 };
 
 /* type of values returned from helper functions */
@@ -162,6 +163,7 @@ enum bpf_return_type {
 	RET_VOID,			/* function doesn't return anything */
 	RET_PTR_TO_MAP_VALUE,		/* returns a pointer to map elem value */
 	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
+	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -213,6 +215,8 @@ enum bpf_reg_type {
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
+	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
+	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -343,6 +347,11 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
 					unsigned long off, unsigned long len);
+typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
+					const struct bpf_insn *src,
+					struct bpf_insn *dst,
+					struct bpf_prog *prog,
+					u32 *target_size);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
@@ -836,4 +845,29 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(CONFIG_NET)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+			      struct bpf_insn_access_aux *info);
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+				const struct bpf_insn *si,
+				struct bpf_insn *insn_buf,
+				struct bpf_prog *prog,
+				u32 *target_size);
+#else
+static inline bool bpf_sock_is_valid_access(int off, int size,
+					    enum bpf_access_type type,
+					    struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+					      const struct bpf_insn *si,
+					      struct bpf_insn *insn_buf,
+					      struct bpf_prog *prog,
+					      u32 *target_size)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d0e7f97e8b60..a411363098a5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -58,6 +58,8 @@ struct bpf_reg_state {
 	 * offset, so they can share range knowledge.
 	 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
 	 * came from, when one is tested for != NULL.
+	 * For PTR_TO_SOCKET this is used to share which pointers retain the
+	 * same reference to the socket, to determine proper reference freeing.
 	 */
 	u32 id;
 	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98b218bd46e8..f86386c9affd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * (like pointer plus pointer becomes SCALAR_VALUE type)
  *
  * When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
- * types recognized by check_mem_access() function.
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
+ * four pointer types recognized by check_mem_access() function.
  *
  * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
  * and the range of [ptr, ptr + map's value_size) is accessible.
@@ -267,6 +267,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 	[PTR_TO_FLOW_KEYS]	= "flow_keys",
+	[PTR_TO_SOCKET]		= "sock",
+	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
 };
 
 static char slot_type_char[] = {
@@ -973,6 +975,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PACKET_END:
 	case PTR_TO_FLOW_KEYS:
 	case CONST_PTR_TO_MAP:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -1341,6 +1345,28 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
 	return 0;
 }
 
+static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
+			     int size, enum bpf_access_type t)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[regno];
+	struct bpf_insn_access_aux info;
+
+	if (reg->smin_value < 0) {
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			regno);
+		return -EACCES;
+	}
+
+	if (!bpf_sock_is_valid_access(off, size, t, &info)) {
+		verbose(env, "invalid bpf_sock access off=%d size=%d\n",
+			off, size);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 static bool __is_pointer_value(bool allow_ptr_leaks,
 			       const struct bpf_reg_state *reg)
 {
@@ -1459,6 +1485,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		 */
 		strict = true;
 		break;
+	case PTR_TO_SOCKET:
+		pointer_desc = "sock ";
+		break;
 	default:
 		break;
 	}
@@ -1726,6 +1755,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_flow_keys_access(env, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_SOCKET) {
+		if (t == BPF_WRITE) {
+			verbose(env, "cannot write into socket\n");
+			return -EACCES;
+		}
+		err = check_sock_access(env, regno, off, size, t);
+		if (!err && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1948,6 +1985,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_ctx_reg(env, reg, regno);
 		if (err < 0)
 			return err;
+	} else if (arg_type == ARG_PTR_TO_SOCKET) {
+		expected_type = PTR_TO_SOCKET;
+		if (type != expected_type)
+			goto err_type;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2543,6 +2584,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].id = ++env->id_gen;
+	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -2680,6 +2725,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -3627,6 +3674,8 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
 			} else {
 				reg->type = PTR_TO_MAP_VALUE;
 			}
+		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
+			reg->type = PTR_TO_SOCKET;
 		}
 		/* We don't need id from this point onwards anymore, thus we
 		 * should better reset it, so that state pruning has chances
@@ -4402,6 +4451,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
 	case PTR_TO_FLOW_KEYS:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -4679,6 +4730,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	return 0;
 }
 
+/* Return true if it's OK to have the same insn return a different type. */
+static bool reg_type_mismatch_ok(enum bpf_reg_type type)
+{
+	switch (type) {
+	case PTR_TO_CTX:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/* If an instruction was previously used with particular pointer types, then we
+ * need to be careful to avoid cases such as the below, where it may be ok
+ * for one branch accessing the pointer, but not ok for the other branch:
+ *
+ * R1 = sock_ptr
+ * goto X;
+ * ...
+ * R1 = some_other_valid_ptr;
+ * goto X;
+ * ...
+ * R2 = *(u32 *)(R1 + 0);
+ */
+static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
+{
+	return src != prev && (!reg_type_mismatch_ok(src) ||
+			       !reg_type_mismatch_ok(prev));
+}
+
 static int do_check(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *state;
@@ -4811,9 +4893,7 @@ static int do_check(struct bpf_verifier_env *env)
 				 */
 				*prev_src_type = src_reg_type;
 
-			} else if (src_reg_type != *prev_src_type &&
-				   (src_reg_type == PTR_TO_CTX ||
-				    *prev_src_type == PTR_TO_CTX)) {
+			} else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
 				/* ABuser program is trying to use the same insn
 				 * dst_reg = *(u32*) (src_reg + off)
 				 * with different pointer types:
@@ -4858,9 +4938,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 			if (*prev_dst_type == NOT_INIT) {
 				*prev_dst_type = dst_reg_type;
-			} else if (dst_reg_type != *prev_dst_type &&
-				   (dst_reg_type == PTR_TO_CTX ||
-				    *prev_dst_type == PTR_TO_CTX)) {
+			} else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
 				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
@@ -5286,8 +5364,10 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
 	}
 }
 
-/* convert load instructions that access fields of 'struct __sk_buff'
- * into sequence of instructions that access fields of 'struct sk_buff'
+/* convert load instructions that access fields of a context type into a
+ * sequence of instructions that access fields of the underlying structure:
+ *     struct __sk_buff    -> struct sk_buff
+ *     struct bpf_sock_ops -> struct sock
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
@@ -5316,12 +5396,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 	}
 
-	if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
+	if (bpf_prog_is_dev_bound(env->prog->aux))
 		return 0;
 
 	insn = env->prog->insnsi + delta;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		bpf_convert_ctx_access_t convert_ctx_access;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
@@ -5363,8 +5445,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			continue;
 		}
 
-		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
+		switch (env->insn_aux_data[i + delta].ptr_type) {
+		case PTR_TO_CTX:
+			if (!ops->convert_ctx_access)
+				continue;
+			convert_ctx_access = ops->convert_ctx_access;
+			break;
+		case PTR_TO_SOCKET:
+			convert_ctx_access = bpf_sock_convert_ctx_access;
+			break;
+		default:
 			continue;
+		}
 
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
 		size = BPF_LDST_BYTES(insn);
@@ -5396,8 +5488,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		target_size = 0;
-		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
-					      &target_size);
+		cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
+					 &target_size);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
 		    (ctx_field_size && !target_size)) {
 			verbose(env, "bpf verifier is misconfigured\n");
diff --git a/net/core/filter.c b/net/core/filter.c
index 72db8afb7cb6..b2cb186252e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5394,23 +5394,29 @@ static bool __sock_filter_check_size(int off, int size,
 	return size == size_default;
 }
 
-static bool sock_filter_is_valid_access(int off, int size,
-					enum bpf_access_type type,
-					const struct bpf_prog *prog,
-					struct bpf_insn_access_aux *info)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+			      struct bpf_insn_access_aux *info)
 {
 	if (off < 0 || off >= sizeof(struct bpf_sock))
 		return false;
 	if (off % size != 0)
 		return false;
-	if (!__sock_filter_check_attach_type(off, type,
-					     prog->expected_attach_type))
-		return false;
 	if (!__sock_filter_check_size(off, size, info))
 		return false;
 	return true;
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	if (!bpf_sock_is_valid_access(off, size, type, info))
+		return false;
+	return __sock_filter_check_attach_type(off, type,
+					       prog->expected_attach_type);
+}
+
 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
 				const struct bpf_prog *prog, int drop_verdict)
 {
@@ -6122,10 +6128,10 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
-					  const struct bpf_insn *si,
-					  struct bpf_insn *insn_buf,
-					  struct bpf_prog *prog, u32 *target_size)
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+				const struct bpf_insn *si,
+				struct bpf_insn *insn_buf,
+				struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 	int off;
@@ -7037,7 +7043,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = {
 const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
-	.convert_ctx_access	= sock_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_sock_convert_ctx_access,
 };
 
 const struct bpf_prog_ops cg_sock_prog_ops = {
-- 
cgit v1.2.3


From fd978bf7fd312581a7ca454a991f0ffb34c4204b Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:35 -0700
Subject: bpf: Add reference tracking to verifier

Allow helper functions to acquire a reference and return it into a
register. Specific pointer types such as the PTR_TO_SOCKET will
implicitly represent such a reference. The verifier must ensure that
these references are released exactly once in each path through the
program.

To achieve this, this commit assigns an id to the pointer and tracks it
in the 'bpf_func_state', then when the function or program exits,
verifies that all of the acquired references have been freed. When the
pointer is passed to a function that frees the reference, it is removed
from the 'bpf_func_state` and all existing copies of the pointer in
registers are marked invalid.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  24 +++-
 kernel/bpf/verifier.c        | 306 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 308 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a411363098a5..7b6fd2ab3263 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -104,6 +104,17 @@ struct bpf_stack_state {
 	u8 slot_type[BPF_REG_SIZE];
 };
 
+struct bpf_reference_state {
+	/* Track each reference created with a unique id, even if the same
+	 * instruction creates the reference multiple times (eg, via CALL).
+	 */
+	int id;
+	/* Instruction where the allocation of this reference occurred. This
+	 * is used purely to inform the user of a reference leak.
+	 */
+	int insn_idx;
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -121,7 +132,9 @@ struct bpf_func_state {
 	 */
 	u32 subprogno;
 
-	/* should be second to last. See copy_func_state() */
+	/* The following fields should be last. See copy_func_state() */
+	int acquired_refs;
+	struct bpf_reference_state *refs;
 	int allocated_stack;
 	struct bpf_stack_state *stack;
 };
@@ -217,11 +230,16 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 
-static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 
-	return cur->frame[cur->curframe]->regs;
+	return cur->frame[cur->curframe];
+}
+
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+	return cur_func(env)->regs;
 }
 
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11e982381061..cd0d8bc00bd1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  *
  * After the call R0 is set to return type of the function and registers R1-R5
  * are set to NOT_INIT to indicate that they are no longer readable.
+ *
+ * The following reference types represent a potential reference to a kernel
+ * resource which, after first being allocated, must be checked and freed by
+ * the BPF program:
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
+ *
+ * When the verifier sees a helper call return a reference type, it allocates a
+ * pointer id for the reference and stores it in the current function state.
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
+ * passes through a NULL-check conditional. For the branch wherein the state is
+ * changed to CONST_IMM, the verifier releases the reference.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -189,6 +202,7 @@ struct bpf_call_arg_meta {
 	int access_size;
 	s64 msize_smax_value;
 	u64 msize_umax_value;
+	int ptr_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -251,7 +265,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
-	return type == PTR_TO_MAP_VALUE_OR_NULL;
+	return type == PTR_TO_MAP_VALUE_OR_NULL ||
+	       type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool type_is_refcounted(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET;
+}
+
+static bool type_is_refcounted_or_null(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool reg_is_refcounted(const struct bpf_reg_state *reg)
+{
+	return type_is_refcounted(reg->type);
+}
+
+static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+{
+	return type_is_refcounted_or_null(reg->type);
+}
+
+static bool arg_type_is_refcounted(enum bpf_arg_type type)
+{
+	return type == ARG_PTR_TO_SOCKET;
+}
+
+/* Determine whether the function releases some resources allocated by another
+ * function call. The first reference type argument will be assumed to be
+ * released by release_reference().
+ */
+static bool is_release_function(enum bpf_func_id func_id)
+{
+	return false;
 }
 
 /* string representation of 'enum bpf_reg_type' */
@@ -385,6 +434,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		else
 			verbose(env, "=%s", types_buf);
 	}
+	if (state->acquired_refs && state->refs[0].id) {
+		verbose(env, " refs=%d", state->refs[0].id);
+		for (i = 1; i < state->acquired_refs; i++)
+			if (state->refs[i].id)
+				verbose(env, ",%d", state->refs[i].id);
+	}
 	verbose(env, "\n");
 }
 
@@ -403,6 +458,8 @@ static int copy_##NAME##_state(struct bpf_func_state *dst,		\
 	       sizeof(*src->FIELD) * (src->COUNT / SIZE));		\
 	return 0;							\
 }
+/* copy_reference_state() */
+COPY_STATE_FN(reference, acquired_refs, refs, 1)
 /* copy_stack_state() */
 COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
 #undef COPY_STATE_FN
@@ -441,6 +498,8 @@ static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
 	state->FIELD = new_##FIELD;					\
 	return 0;							\
 }
+/* realloc_reference_state() */
+REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
 /* realloc_stack_state() */
 REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
 #undef REALLOC_STATE_FN
@@ -452,16 +511,89 @@ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
  * which realloc_stack_state() copies over. It points to previous
  * bpf_verifier_state which is never reallocated.
  */
-static int realloc_func_state(struct bpf_func_state *state, int size,
-			      bool copy_old)
+static int realloc_func_state(struct bpf_func_state *state, int stack_size,
+			      int refs_size, bool copy_old)
 {
-	return realloc_stack_state(state, size, copy_old);
+	int err = realloc_reference_state(state, refs_size, copy_old);
+	if (err)
+		return err;
+	return realloc_stack_state(state, stack_size, copy_old);
+}
+
+/* Acquire a pointer id from the env and update the state->refs to include
+ * this new pointer reference.
+ * On success, returns a valid pointer id to associate with the register
+ * On failure, returns a negative errno.
+ */
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int new_ofs = state->acquired_refs;
+	int id, err;
+
+	err = realloc_reference_state(state, state->acquired_refs + 1, true);
+	if (err)
+		return err;
+	id = ++env->id_gen;
+	state->refs[new_ofs].id = id;
+	state->refs[new_ofs].insn_idx = insn_idx;
+
+	return id;
+}
+
+/* release function corresponding to acquire_reference_state(). Idempotent. */
+static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+{
+	int i, last_idx;
+
+	if (!ptr_id)
+		return -EFAULT;
+
+	last_idx = state->acquired_refs - 1;
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].id == ptr_id) {
+			if (last_idx && i != last_idx)
+				memcpy(&state->refs[i], &state->refs[last_idx],
+				       sizeof(*state->refs));
+			memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+			state->acquired_refs--;
+			return 0;
+		}
+	}
+	return -EFAULT;
+}
+
+/* variation on the above for cases where we expect that there must be an
+ * outstanding reference for the specified ptr_id.
+ */
+static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int err;
+
+	err = __release_reference_state(state, ptr_id);
+	if (WARN_ON_ONCE(err != 0))
+		verbose(env, "verifier internal error: can't release reference\n");
+	return err;
+}
+
+static int transfer_reference_state(struct bpf_func_state *dst,
+				    struct bpf_func_state *src)
+{
+	int err = realloc_reference_state(dst, src->acquired_refs, false);
+	if (err)
+		return err;
+	err = copy_reference_state(dst, src);
+	if (err)
+		return err;
+	return 0;
 }
 
 static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
 		return;
+	kfree(state->refs);
 	kfree(state->stack);
 	kfree(state);
 }
@@ -487,10 +619,14 @@ static int copy_func_state(struct bpf_func_state *dst,
 {
 	int err;
 
-	err = realloc_func_state(dst, src->allocated_stack, false);
+	err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
+				 false);
+	if (err)
+		return err;
+	memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
+	err = copy_reference_state(dst, src);
 	if (err)
 		return err;
-	memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
 	return copy_stack_state(dst, src);
 }
 
@@ -1015,7 +1151,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	enum bpf_reg_type type;
 
 	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
-				 true);
+				 state->acquired_refs, true);
 	if (err)
 		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -1399,7 +1535,8 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 {
 	const struct bpf_reg_state *reg = cur_regs(env) + regno;
 
-	return reg->type == PTR_TO_CTX;
+	return reg->type == PTR_TO_CTX ||
+	       reg->type == PTR_TO_SOCKET;
 }
 
 static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -2003,6 +2140,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_SOCKET;
 		if (type != expected_type)
 			goto err_type;
+		if (meta->ptr_id || !reg->id) {
+			verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n",
+				meta->ptr_id, reg->id);
+			return -EFAULT;
+		}
+		meta->ptr_id = reg->id;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2292,10 +2435,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
+static bool check_refcount_ok(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (arg_type_is_refcounted(fn->arg1_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg2_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg3_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg4_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg5_type))
+		count++;
+
+	/* We only support one arg being unreferenced at the moment,
+	 * which is sufficient for the helper functions we have right now.
+	 */
+	return count <= 1;
+}
+
 static int check_func_proto(const struct bpf_func_proto *fn)
 {
 	return check_raw_mode_ok(fn) &&
-	       check_arg_pair_ok(fn) ? 0 : -EINVAL;
+	       check_arg_pair_ok(fn) &&
+	       check_refcount_ok(fn) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2328,12 +2493,45 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		__clear_all_pkt_pointers(env, vstate->frame[i]);
 }
 
+static void release_reg_references(struct bpf_verifier_env *env,
+				   struct bpf_func_state *state, int id)
+{
+	struct bpf_reg_state *regs = state->regs, *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].id == id)
+			mark_reg_unknown(env, regs, i);
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		if (reg_is_refcounted(reg) && reg->id == id)
+			__mark_reg_unknown(reg);
+	}
+}
+
+/* The pointer with the specified id has released its reference to kernel
+ * resources. Identify all copies of the same pointer and clear the reference.
+ */
+static int release_reference(struct bpf_verifier_env *env,
+			     struct bpf_call_arg_meta *meta)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	int i;
+
+	for (i = 0; i <= vstate->curframe; i++)
+		release_reg_references(env, vstate->frame[i], meta->ptr_id);
+
+	return release_reference_state(env, meta->ptr_id);
+}
+
 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			   int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
-	int i, subprog, target_insn;
+	int i, err, subprog, target_insn;
 
 	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
 		verbose(env, "the call stack of %d frames is too deep\n",
@@ -2371,6 +2569,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			state->curframe + 1 /* frameno within this callchain */,
 			subprog /* subprog number within this prog */);
 
+	/* Transfer references to the callee */
+	err = transfer_reference_state(callee, caller);
+	if (err)
+		return err;
+
 	/* copy r1 - r5 args that callee can access.  The copy includes parent
 	 * pointers, which connects us up to the liveness chain
 	 */
@@ -2403,6 +2606,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
 	struct bpf_reg_state *r0;
+	int err;
 
 	callee = state->frame[state->curframe];
 	r0 = &callee->regs[BPF_REG_0];
@@ -2422,6 +2626,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	/* return to the caller whatever r0 had in the callee */
 	caller->regs[BPF_REG_0] = *r0;
 
+	/* Transfer references to the caller */
+	err = transfer_reference_state(caller, callee);
+	if (err)
+		return err;
+
 	*insn_idx = callee->callsite + 1;
 	if (env->log.level) {
 		verbose(env, "returning from callee:\n");
@@ -2478,6 +2687,18 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	return 0;
 }
 
+static int check_reference_leak(struct bpf_verifier_env *env)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int i;
+
+	for (i = 0; i < state->acquired_refs; i++) {
+		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
+			state->refs[i].id, state->refs[i].insn_idx);
+	}
+	return state->acquired_refs ? -EINVAL : 0;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2556,6 +2777,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			return err;
 	}
 
+	if (func_id == BPF_FUNC_tail_call) {
+		err = check_reference_leak(env);
+		if (err) {
+			verbose(env, "tail_call would lead to reference leak\n");
+			return err;
+		}
+	} else if (is_release_function(func_id)) {
+		err = release_reference(env, &meta);
+		if (err)
+			return err;
+	}
+
 	regs = cur_regs(env);
 
 	/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -2599,9 +2832,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].id = ++env->id_gen;
 	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+		int id = acquire_reference_state(env, insn_idx);
+		if (id < 0)
+			return id;
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
-		regs[BPF_REG_0].id = ++env->id_gen;
+		regs[BPF_REG_0].id = id;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -3665,7 +3901,8 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src,
 	}
 }
 
-static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
+static void mark_ptr_or_null_reg(struct bpf_func_state *state,
+				 struct bpf_reg_state *reg, u32 id,
 				 bool is_null)
 {
 	if (reg_type_may_be_null(reg->type) && reg->id == id) {
@@ -3691,11 +3928,13 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 			reg->type = PTR_TO_SOCKET;
 		}
-		/* We don't need id from this point onwards anymore, thus we
-		 * should better reset it, so that state pruning has chances
-		 * to take effect.
-		 */
-		reg->id = 0;
+		if (is_null || !reg_is_refcounted(reg)) {
+			/* We don't need id from this point onwards anymore,
+			 * thus we should better reset it, so that state
+			 * pruning has chances to take effect.
+			 */
+			reg->id = 0;
+		}
 	}
 }
 
@@ -3710,15 +3949,18 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 	u32 id = regs[regno].id;
 	int i, j;
 
+	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
+		__release_reference_state(state, id);
+
 	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(&regs[i], id, is_null);
+		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
 		bpf_for_each_spilled_reg(i, state, reg) {
 			if (!reg)
 				continue;
-			mark_ptr_or_null_reg(reg, id, is_null);
+			mark_ptr_or_null_reg(state, reg, id, is_null);
 		}
 	}
 }
@@ -4050,6 +4292,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	if (err)
 		return err;
 
+	/* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
+	 * gen_ld_abs() may terminate the program at runtime, leading to
+	 * reference leak.
+	 */
+	err = check_reference_leak(env);
+	if (err) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+		return err;
+	}
+
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -4542,6 +4794,14 @@ static bool stacksafe(struct bpf_func_state *old,
 	return true;
 }
 
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+{
+	if (old->acquired_refs != cur->acquired_refs)
+		return false;
+	return !memcmp(old->refs, cur->refs,
+		       sizeof(*old->refs) * old->acquired_refs);
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -4587,6 +4847,9 @@ static bool func_states_equal(struct bpf_func_state *old,
 
 	if (!stacksafe(old, cur, idmap))
 		goto out_free;
+
+	if (!refsafe(old, cur))
+		goto out_free;
 	ret = true;
 out_free:
 	kfree(idmap);
@@ -4868,6 +5131,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		regs = cur_regs(env);
 		env->insn_aux_data[insn_idx].seen = true;
+
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -5032,6 +5296,10 @@ static int do_check(struct bpf_verifier_env *env)
 					continue;
 				}
 
+				err = check_reference_leak(env);
+				if (err)
+					return err;
+
 				/* eBPF calling convetion is such that R0 is used
 				 * to return the value from eBPF program.
 				 * Make sure that it's readable at this time
-- 
cgit v1.2.3


From 6acc9b432e6714d72d7d77ec7c27f6f8358d0c71 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:36 -0700
Subject: bpf: Add helper to retrieve socket in BPF

This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
socket listening on this host, and returns a socket pointer which the
BPF program can then access to determine, for instance, whether to
forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
socket, so when a BPF program makes use of this function, it must
subsequently pass the returned pointer into the newly added sk_release()
to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h                  |  93 +++++++++++++++++-
 kernel/bpf/verifier.c                     |   8 +-
 net/core/filter.c                         | 151 ++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  93 +++++++++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  12 +++
 5 files changed, 354 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e2070d819e04..f9187b41dff6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2144,6 +2144,77 @@ union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for UDP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * int bpf_sk_release(struct bpf_sock *sk)
+ *	Description
+ *		Release the reference held by *sock*. *sock* must be a non-NULL
+ *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2229,7 +2300,10 @@ union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2399,6 +2473,23 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd0d8bc00bd1..73c81bef6ae8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
+ * bpf_sk_release(). When a reference type passes into the release function,
+ * the verifier also releases the reference. If any unchecked or unreleased
+ * reference remains at the end of the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index b2cb186252e4..591c698bc517 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,13 +58,17 @@
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
 #include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/ip_fib.h>
 #include <net/flow.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
+#include <net/net_namespace.h>
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
@@ -4813,6 +4817,141 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+		       struct sk_buff *skb, u8 family, u8 proto)
+{
+	int dif = skb->dev->ifindex;
+	bool refcounted = false;
+	struct sock *sk = NULL;
+
+	if (family == AF_INET) {
+		__be32 src4 = tuple->ipv4.saddr;
+		__be32 dst4 = tuple->ipv4.daddr;
+		int sdif = inet_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+					   src4, tuple->ipv4.sport,
+					   dst4, tuple->ipv4.dport,
+					   dif, sdif, &refcounted);
+		else
+			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
+					       dst4, tuple->ipv4.dport,
+					       dif, sdif, &udp_table, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
+		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+		int sdif = inet6_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+					    src6, tuple->ipv6.sport,
+					    dst6, tuple->ipv6.dport,
+					    dif, sdif, &refcounted);
+		else
+			sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
+					       dst6, tuple->ipv6.dport,
+					       dif, sdif, &udp_table, skb);
+#endif
+	}
+
+	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+		sk = NULL;
+	}
+	return sk;
+}
+
+/* bpf_sk_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ * Returns the socket as an 'unsigned long' to simplify the casting in the
+ * callers to satisfy BPF_CALL declarations.
+ */
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct net *caller_net;
+	struct sock *sk = NULL;
+	u8 family = AF_UNSPEC;
+	struct net *net;
+
+	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
+	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
+		goto out;
+
+	if (skb->dev)
+		caller_net = dev_net(skb->dev);
+	else
+		caller_net = sock_net(skb->sk);
+	if (netns_id) {
+		net = get_net_ns_by_id(caller_net, netns_id);
+		if (unlikely(!net))
+			goto out;
+		sk = sk_lookup(net, tuple, skb, family, proto);
+		put_net(net);
+	} else {
+		net = caller_net;
+		sk = sk_lookup(net, tuple, skb, family, proto);
+	}
+
+	if (sk)
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+	.func		= bpf_sk_lookup_tcp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+	.func		= bpf_sk_lookup_udp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_release, struct sock *, sk)
+{
+	if (!sock_flag(sk, SOCK_RCU_FREE))
+		sock_gen_put(sk);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+};
+
 bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -5019,6 +5158,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_ancestor_cgroup_id:
 		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5119,6 +5264,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_hash_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e2070d819e04..f9187b41dff6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2144,6 +2144,77 @@ union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for UDP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * int bpf_sk_release(struct bpf_sock *sk)
+ *	Description
+ *		Release the reference held by *sock*. *sock* must be a non-NULL
+ *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2229,7 +2300,10 @@ union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2399,6 +2473,23 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index e4be7730222d..1d407b3494f9 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
 	(void *) BPF_FUNC_skb_cgroup_id;
 static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
 	(void *) BPF_FUNC_skb_ancestor_cgroup_id;
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_tcp;
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_udp;
+static int (*bpf_sk_release)(struct bpf_sock *sk) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3


From 8873c064d1de579ea23412a6d3eee972593f142b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 1 Oct 2018 23:24:26 -0700
Subject: tcp: do not release socket ownership in tcp_close()

syzkaller was able to hit the WARN_ON(sock_owned_by_user(sk));
in tcp_close()

While a socket is being closed, it is very possible other
threads find it in rtnetlink dump.

tcp_get_info() will acquire the socket lock for a short amount
of time (slow = lock_sock_fast(sk)/unlock_sock_fast(sk, slow);),
enough to trigger the warning.

Fixes: 67db3e4bfbc9 ("tcp: no longer hold ehash lock while calling tcp_get_info()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |  1 +
 net/core/sock.c    |  2 +-
 net/ipv4/tcp.c     | 11 +++--------
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 38cae35f6e16..751549ac0a84 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1492,6 +1492,7 @@ static inline void lock_sock(struct sock *sk)
 	lock_sock_nested(sk, 0);
 }
 
+void __release_sock(struct sock *sk);
 void release_sock(struct sock *sk);
 
 /* BH context may only use the following locking interface. */
diff --git a/net/core/sock.c b/net/core/sock.c
index 8537b6ca72c5..7e8796a6a089 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2317,7 +2317,7 @@ static void __lock_sock(struct sock *sk)
 	finish_wait(&sk->sk_lock.wq, &wait);
 }
 
-static void __release_sock(struct sock *sk)
+void __release_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2827fa5643bd..43ef83b2330e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2416,16 +2416,10 @@ adjudge_to_death:
 	sock_hold(sk);
 	sock_orphan(sk);
 
-	/* It is the last release_sock in its life. It will remove backlog. */
-	release_sock(sk);
-
-
-	/* Now socket is owned by kernel and we acquire BH lock
-	 *  to finish close. No need to check for user refs.
-	 */
 	local_bh_disable();
 	bh_lock_sock(sk);
-	WARN_ON(sock_owned_by_user(sk));
+	/* remove backlog if any, without releasing ownership. */
+	__release_sock(sk);
 
 	percpu_counter_inc(sk->sk_prot->orphan_count);
 
@@ -2494,6 +2488,7 @@ adjudge_to_death:
 out:
 	bh_unlock_sock(sk);
 	local_bh_enable();
+	release_sock(sk);
 	sock_put(sk);
 }
 EXPORT_SYMBOL(tcp_close);
-- 
cgit v1.2.3


From f3edc2dbe0ad0bbbd8450cd37328f99acf215fd8 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Mon, 1 Oct 2018 17:02:43 +0100
Subject: net: usbnet: make driver_info const

The driver_info field that is used for describing each of the usb-net
drivers using the usbnet.c core all declare their information as const
and the usbnet.c itself does not try and modify the struct.

It is therefore a good idea to make this const in the usbnet.c structure
in case anyone tries to modify it.

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c   | 12 ++++++------
 include/linux/usb/usbnet.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 73aa33364d80..504282af27e5 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -802,7 +802,7 @@ static void usbnet_terminate_urbs(struct usbnet *dev)
 int usbnet_stop (struct net_device *net)
 {
 	struct usbnet		*dev = netdev_priv(net);
-	struct driver_info	*info = dev->driver_info;
+	const struct driver_info *info = dev->driver_info;
 	int			retval, pm, mpn;
 
 	clear_bit(EVENT_DEV_OPEN, &dev->flags);
@@ -865,7 +865,7 @@ int usbnet_open (struct net_device *net)
 {
 	struct usbnet		*dev = netdev_priv(net);
 	int			retval;
-	struct driver_info	*info = dev->driver_info;
+	const struct driver_info *info = dev->driver_info;
 
 	if ((retval = usb_autopm_get_interface(dev->intf)) < 0) {
 		netif_info(dev, ifup, dev->net,
@@ -1205,7 +1205,7 @@ fail_lowmem:
 	}
 
 	if (test_bit (EVENT_LINK_RESET, &dev->flags)) {
-		struct driver_info	*info = dev->driver_info;
+		const struct driver_info *info = dev->driver_info;
 		int			retval = 0;
 
 		clear_bit (EVENT_LINK_RESET, &dev->flags);
@@ -1353,7 +1353,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 	unsigned int			length;
 	struct urb		*urb = NULL;
 	struct skb_data		*entry;
-	struct driver_info	*info = dev->driver_info;
+	const struct driver_info *info = dev->driver_info;
 	unsigned long		flags;
 	int retval;
 
@@ -1647,7 +1647,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	struct usbnet			*dev;
 	struct net_device		*net;
 	struct usb_host_interface	*interface;
-	struct driver_info		*info;
+	const struct driver_info	*info;
 	struct usb_device		*xdev;
 	int				status;
 	const char			*name;
@@ -1663,7 +1663,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	}
 
 	name = udev->dev.driver->name;
-	info = (struct driver_info *) prod->driver_info;
+	info = (const struct driver_info *) prod->driver_info;
 	if (!info) {
 		dev_dbg (&udev->dev, "blacklisted by %s\n", name);
 		return -ENODEV;
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index e2ec3582e549..d8860f2d0976 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -28,7 +28,7 @@ struct usbnet {
 	/* housekeeping */
 	struct usb_device	*udev;
 	struct usb_interface	*intf;
-	struct driver_info	*driver_info;
+	const struct driver_info *driver_info;
 	const char		*driver_name;
 	void			*driver_priv;
 	wait_queue_head_t	wait;
-- 
cgit v1.2.3


From 4e6d47206c32d1bbb4931f1d851dae3870e0df81 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Sun, 30 Sep 2018 08:04:35 +0530
Subject: tls: Add support for inplace records encryption

Presently, for non-zero copy case, separate pages are allocated for
storing plaintext and encrypted text of records. These pages are stored
in sg_plaintext_data and sg_encrypted_data scatterlists inside record
structure. Further, sg_plaintext_data & sg_encrypted_data are passed
to cryptoapis for record encryption. Allocating separate pages for
plaintext and encrypted text is inefficient from both required memory
and performance point of view.

This patch adds support of inplace encryption of records. For non-zero
copy case, we reuse the pages from sg_encrypted_data scatterlist to
copy the application's plaintext data. For the movement of pages from
sg_encrypted_data to sg_plaintext_data scatterlists, we introduce a new
function move_to_plaintext_sg(). This function add pages into
sg_plaintext_data from sg_encrypted_data scatterlists.

tls_do_encryption() is modified to pass the same scatterlist as both
source and destination into aead_request_set_crypt() if inplace crypto
has been enabled. A new ariable 'inplace_crypto' has been introduced in
record structure to signify whether the same scatterlist can be used.
By default, the inplace_crypto is enabled in get_rec(). If zero-copy is
used (i.e. plaintext data is not copied), inplace_crypto is set to '0'.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Reviewed-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h |  1 +
 net/tls/tls_sw.c  | 91 ++++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 74 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 262420cdad10..5e853835597e 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -101,6 +101,7 @@ struct tls_rec {
 	struct list_head list;
 	int tx_ready;
 	int tx_flags;
+	int inplace_crypto;
 
 	/* AAD | sg_plaintext_data | sg_tag */
 	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1];
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 1d4c354d5516..aa9fdce272b6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -281,24 +281,72 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
 	return rc;
 }
 
-static int alloc_plaintext_sg(struct sock *sk, int len)
+static int move_to_plaintext_sg(struct sock *sk, int required_size)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
-	int rc = 0;
+	struct scatterlist *plain_sg = &rec->sg_plaintext_data[1];
+	struct scatterlist *enc_sg = &rec->sg_encrypted_data[1];
+	int enc_sg_idx = 0;
+	int skip, len;
 
-	rc = sk_alloc_sg(sk, len,
-			 &rec->sg_plaintext_data[1], 0,
-			 &rec->sg_plaintext_num_elem,
-			 &rec->sg_plaintext_size,
-			 tls_ctx->pending_open_record_frags);
+	if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS)
+		return -ENOSPC;
 
-	if (rc == -ENOSPC)
-		rec->sg_plaintext_num_elem =
-			ARRAY_SIZE(rec->sg_plaintext_data) - 1;
+	/* We add page references worth len bytes from enc_sg at the
+	 * end of plain_sg. It is guaranteed that sg_encrypted_data
+	 * has enough required room (ensured by caller).
+	 */
+	len = required_size - rec->sg_plaintext_size;
 
-	return rc;
+	/* Skip initial bytes in sg_encrypted_data to be able
+	 * to use same offset of both plain and encrypted data.
+	 */
+	skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size;
+
+	while (enc_sg_idx < rec->sg_encrypted_num_elem) {
+		if (enc_sg[enc_sg_idx].length > skip)
+			break;
+
+		skip -= enc_sg[enc_sg_idx].length;
+		enc_sg_idx++;
+	}
+
+	/* unmark the end of plain_sg*/
+	sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1);
+
+	while (len) {
+		struct page *page = sg_page(&enc_sg[enc_sg_idx]);
+		int bytes = enc_sg[enc_sg_idx].length - skip;
+		int offset = enc_sg[enc_sg_idx].offset + skip;
+
+		if (bytes > len)
+			bytes = len;
+		else
+			enc_sg_idx++;
+
+		/* Skipping is required only one time */
+		skip = 0;
+
+		/* Increment page reference */
+		get_page(page);
+
+		sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page,
+			    bytes, offset);
+
+		sk_mem_charge(sk, bytes);
+
+		len -= bytes;
+		rec->sg_plaintext_size += bytes;
+
+		rec->sg_plaintext_num_elem++;
+
+		if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS)
+			return -ENOSPC;
+	}
+
+	return 0;
 }
 
 static void free_sg(struct sock *sk, struct scatterlist *sg,
@@ -459,16 +507,21 @@ static int tls_do_encryption(struct sock *sk,
 			     size_t data_len)
 {
 	struct tls_rec *rec = ctx->open_rec;
+	struct scatterlist *plain_sg = rec->sg_plaintext_data;
+	struct scatterlist *enc_sg = rec->sg_encrypted_data;
 	int rc;
 
 	/* Skip the first index as it contains AAD data */
 	rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size;
 	rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size;
 
+	/* If it is inplace crypto, then pass same SG list as both src, dst */
+	if (rec->inplace_crypto)
+		plain_sg = enc_sg;
+
 	aead_request_set_tfm(aead_req, ctx->aead_send);
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
-	aead_request_set_crypt(aead_req, rec->sg_plaintext_data,
-			       rec->sg_encrypted_data,
+	aead_request_set_crypt(aead_req, plain_sg, enc_sg,
 			       data_len, tls_ctx->tx.iv);
 
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
@@ -666,6 +719,7 @@ static struct tls_rec *get_rec(struct sock *sk)
 		   sizeof(rec->aad_space));
 
 	ctx->open_rec = rec;
+	rec->inplace_crypto = 1;
 
 	return rec;
 }
@@ -763,6 +817,8 @@ alloc_encrypted:
 			if (ret)
 				goto fallback_to_reg_send;
 
+			rec->inplace_crypto = 0;
+
 			num_zc++;
 			copied += try_to_copy;
 			ret = tls_push_record(sk, msg->msg_flags, record_type);
@@ -782,11 +838,11 @@ fallback_to_reg_send:
 		}
 
 		required_size = rec->sg_plaintext_size + try_to_copy;
-alloc_plaintext:
-		ret = alloc_plaintext_sg(sk, required_size);
+
+		ret = move_to_plaintext_sg(sk, required_size);
 		if (ret) {
 			if (ret != -ENOSPC)
-				goto wait_for_memory;
+				goto send_end;
 
 			/* Adjust try_to_copy according to the amount that was
 			 * actually allocated. The difference is due
@@ -831,8 +887,6 @@ trim_sgl:
 
 		if (rec->sg_encrypted_size < required_size)
 			goto alloc_encrypted;
-
-		goto alloc_plaintext;
 	}
 
 	if (!num_async) {
@@ -958,6 +1012,7 @@ alloc_payload:
 		if (full_record || eor ||
 		    rec->sg_plaintext_num_elem ==
 		    ARRAY_SIZE(rec->sg_plaintext_data) - 1) {
+			rec->inplace_crypto = 0;
 			ret = tls_push_record(sk, flags, record_type);
 			if (ret) {
 				if (ret == -EINPROGRESS)
-- 
cgit v1.2.3


From e9837e55b0200da544a095a1fca36efd7fd3ba30 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Mon, 1 Oct 2018 18:23:08 -0700
Subject: netfilter: xt_quota: fix the behavior of xt_quota module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A major flaw of the current xt_quota module is that quota in a specific
rule gets reset every time there is a rule change in the same table. It
makes the xt_quota module not very useful in a table in which iptables
rules are changed at run time. This fix introduces a new counter that is
visible to userspace as the remaining quota of the current rule. When
userspace restores the rules in a table, it can restore the counter to
the remaining quota instead of resetting it to the full quota.

Signed-off-by: Chenbo Feng <fengc@google.com>
Suggested-by: Maciej Żenczykowski <maze@google.com>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_quota.h |  8 +++--
 net/netfilter/xt_quota.c                | 55 +++++++++++++--------------------
 2 files changed, 27 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_quota.h b/include/uapi/linux/netfilter/xt_quota.h
index f3ba5d9e58b6..d72fd52adbba 100644
--- a/include/uapi/linux/netfilter/xt_quota.h
+++ b/include/uapi/linux/netfilter/xt_quota.h
@@ -15,9 +15,11 @@ struct xt_quota_info {
 	__u32 flags;
 	__u32 pad;
 	__aligned_u64 quota;
-
-	/* Used internally by the kernel */
-	struct xt_quota_priv	*master;
+#ifdef __KERNEL__
+	atomic64_t counter;
+#else
+	__aligned_u64 remain;
+#endif
 };
 
 #endif /* _XT_QUOTA_H */
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 10d61a6eed71..6afa7f468a73 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -11,11 +11,6 @@
 #include <linux/netfilter/xt_quota.h>
 #include <linux/module.h>
 
-struct xt_quota_priv {
-	spinlock_t	lock;
-	uint64_t	quota;
-};
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
 MODULE_DESCRIPTION("Xtables: countdown quota match");
@@ -26,54 +21,48 @@ static bool
 quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	struct xt_quota_info *q = (void *)par->matchinfo;
-	struct xt_quota_priv *priv = q->master;
+	u64 current_count = atomic64_read(&q->counter);
 	bool ret = q->flags & XT_QUOTA_INVERT;
-
-	spin_lock_bh(&priv->lock);
-	if (priv->quota >= skb->len) {
-		priv->quota -= skb->len;
-		ret = !ret;
-	} else {
-		/* we do not allow even small packets from now on */
-		priv->quota = 0;
-	}
-	spin_unlock_bh(&priv->lock);
-
-	return ret;
+	u64 old_count, new_count;
+
+	do {
+		if (current_count == 1)
+			return ret;
+		if (current_count <= skb->len) {
+			atomic64_set(&q->counter, 1);
+			return ret;
+		}
+		old_count = current_count;
+		new_count = current_count - skb->len;
+		current_count = atomic64_cmpxchg(&q->counter, old_count,
+						 new_count);
+	} while (current_count != old_count);
+	return !ret;
 }
 
 static int quota_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_quota_info *q = par->matchinfo;
 
+	BUILD_BUG_ON(sizeof(atomic64_t) != sizeof(__aligned_u64));
+
 	if (q->flags & ~XT_QUOTA_MASK)
 		return -EINVAL;
+	if (atomic64_read(&q->counter) > q->quota + 1)
+		return -ERANGE;
 
-	q->master = kmalloc(sizeof(*q->master), GFP_KERNEL);
-	if (q->master == NULL)
-		return -ENOMEM;
-
-	spin_lock_init(&q->master->lock);
-	q->master->quota = q->quota;
+	if (atomic64_read(&q->counter) == 0)
+		atomic64_set(&q->counter, q->quota + 1);
 	return 0;
 }
 
-static void quota_mt_destroy(const struct xt_mtdtor_param *par)
-{
-	const struct xt_quota_info *q = par->matchinfo;
-
-	kfree(q->master);
-}
-
 static struct xt_match quota_mt_reg __read_mostly = {
 	.name       = "quota",
 	.revision   = 0,
 	.family     = NFPROTO_UNSPEC,
 	.match      = quota_mt,
 	.checkentry = quota_mt_check,
-	.destroy    = quota_mt_destroy,
 	.matchsize  = sizeof(struct xt_quota_info),
-	.usersize   = offsetof(struct xt_quota_info, master),
 	.me         = THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 16fc087b9cb22c9a97307cc24a5413d0df68fe11 Mon Sep 17 00:00:00 2001
From: Yashaswini Raghuram Prathivadi Bhayankaram
 <yashaswini.raghuram.prathivadi.bhayankaram@intel.com>
Date: Fri, 10 Aug 2018 00:17:44 -0700
Subject: virtchnl: Added support to exchange additional speed values

Introduced a new virtchnl capability flag and a struct to support exchange
of additional supported speeds.

Signed-off-by: Yashaswini Raghuram Prathivadi Bhayankaram <yashaswini.raghuram.prathivadi.bhayankaram@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/avf/virtchnl.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index b41f7bc958ef..2c9756bd9c4c 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -252,6 +252,8 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM	0X00400000
 #define VIRTCHNL_VF_OFFLOAD_ADQ			0X00800000
 
+/* Define below the capability flags that are not offloads */
+#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		0x00000080
 #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \
 			       VIRTCHNL_VF_OFFLOAD_VLAN | \
 			       VIRTCHNL_VF_OFFLOAD_RSS_PF)
@@ -596,10 +598,23 @@ enum virtchnl_event_codes {
 struct virtchnl_pf_event {
 	enum virtchnl_event_codes event;
 	union {
+		/* If the PF driver does not support the new speed reporting
+		 * capabilities then use link_event else use link_event_adv to
+		 * get the speed and link information. The ability to understand
+		 * new speeds is indicated by setting the capability flag
+		 * VIRTCHNL_VF_CAP_ADV_LINK_SPEED in vf_cap_flags parameter
+		 * in virtchnl_vf_resource struct and can be used to determine
+		 * which link event struct to use below.
+		 */
 		struct {
 			enum virtchnl_link_speed link_speed;
 			bool link_status;
 		} link_event;
+		struct {
+			/* link_speed provided in Mbps */
+			u32 link_speed;
+			u8 link_status;
+		} link_event_adv;
 	} event_data;
 
 	int severity;
-- 
cgit v1.2.3


From db7ff19e7b119adb4618fbc6410b441d1c3b55c5 Mon Sep 17 00:00:00 2001
From: Eli Britstein <elibr@mellanox.com>
Date: Wed, 15 Aug 2018 16:02:18 +0300
Subject: devlink: Add extack for eswitch operations

Add extack argument to the eswitch related operations.

Signed-off-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c         |  3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h         |  3 ++-
 drivers/net/ethernet/cavium/liquidio/lio_main.c       |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h     |  9 ++++++---
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c    | 19 ++++++++++++-------
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c      |  3 ++-
 include/net/devlink.h                                 |  9 ++++++---
 net/core/devlink.c                                    |  8 +++++---
 8 files changed, 37 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
index b574fe8e974e..9a25c05aa571 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
@@ -521,7 +521,8 @@ int bnxt_dl_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 	return 0;
 }
 
-int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode)
+int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode,
+			     struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = bnxt_get_bp_from_dl(devlink);
 	int rc = 0;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h
index 38b9a75ad724..d7287651422f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h
@@ -30,7 +30,8 @@ static inline u16 bnxt_vf_rep_get_fid(struct net_device *dev)
 
 bool bnxt_dev_is_vf_rep(struct net_device *dev);
 int bnxt_dl_eswitch_mode_get(struct devlink *devlink, u16 *mode);
-int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode);
+int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode,
+			     struct netlink_ext_ack *extack);
 
 #else
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 9d70e5c6157f..3d24133e5e49 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3144,7 +3144,8 @@ liquidio_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 }
 
 static int
-liquidio_eswitch_mode_set(struct devlink *devlink, u16 mode)
+liquidio_eswitch_mode_set(struct devlink *devlink, u16 mode,
+			  struct netlink_ext_ack *extack)
 {
 	struct lio_devlink_priv *priv;
 	struct octeon_device *oct;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 0b05bf2b91f6..dfc642de4e6d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -269,12 +269,15 @@ struct mlx5_esw_flow_attr {
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 };
 
-int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
+				  struct netlink_ext_ack *extack);
 int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
-int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode);
+int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
+					 struct netlink_ext_ack *extack);
 int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode);
 int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode);
-int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap);
+int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
+					struct netlink_ext_ack *extack);
 int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap);
 void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 21e957083f65..eee34ffcf242 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -810,7 +810,8 @@ out:
 	return flow_rule;
 }
 
-static int esw_offloads_start(struct mlx5_eswitch *esw)
+static int esw_offloads_start(struct mlx5_eswitch *esw,
+			      struct netlink_ext_ack *extack)
 {
 	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
 
@@ -973,7 +974,8 @@ create_ft_err:
 	return err;
 }
 
-static int esw_offloads_stop(struct mlx5_eswitch *esw)
+static int esw_offloads_stop(struct mlx5_eswitch *esw,
+			     struct netlink_ext_ack *extack)
 {
 	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
 
@@ -1092,7 +1094,8 @@ static int mlx5_devlink_eswitch_check(struct devlink *devlink)
 	return 0;
 }
 
-int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
+				  struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	u16 cur_mlx5_mode, mlx5_mode = 0;
@@ -1111,9 +1114,9 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
 		return 0;
 
 	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV)
-		return esw_offloads_start(dev->priv.eswitch);
+		return esw_offloads_start(dev->priv.eswitch, extack);
 	else if (mode == DEVLINK_ESWITCH_MODE_LEGACY)
-		return esw_offloads_stop(dev->priv.eswitch);
+		return esw_offloads_stop(dev->priv.eswitch, extack);
 	else
 		return -EINVAL;
 }
@@ -1130,7 +1133,8 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 	return esw_mode_to_devlink(dev->priv.eswitch->mode, mode);
 }
 
-int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
+int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
+					 struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
@@ -1232,7 +1236,8 @@ out:
 	return 0;
 }
 
-int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap)
+int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
+					struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index db463e20a876..4213fe42ac4d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -177,7 +177,8 @@ static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 	return nfp_app_eswitch_mode_get(pf->app, mode);
 }
 
-static int nfp_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+static int nfp_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
+					struct netlink_ext_ack *extack)
 {
 	struct nfp_pf *pf = devlink_priv(devlink);
 	int ret;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index b9b89d6604d4..70671f0d4c30 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -451,11 +451,14 @@ struct devlink_ops {
 				       u32 *p_cur, u32 *p_max);
 
 	int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode);
-	int (*eswitch_mode_set)(struct devlink *devlink, u16 mode);
+	int (*eswitch_mode_set)(struct devlink *devlink, u16 mode,
+				struct netlink_ext_ack *extack);
 	int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode);
-	int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode);
+	int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode,
+				       struct netlink_ext_ack *extack);
 	int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode);
-	int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode);
+	int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode,
+				      struct netlink_ext_ack *extack);
 };
 
 static inline void *devlink_priv(struct devlink *devlink)
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8c0ed225e280..de6adad7ccbe 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1626,7 +1626,7 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
 		if (!ops->eswitch_mode_set)
 			return -EOPNOTSUPP;
 		mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
-		err = ops->eswitch_mode_set(devlink, mode);
+		err = ops->eswitch_mode_set(devlink, mode, info->extack);
 		if (err)
 			return err;
 	}
@@ -1636,7 +1636,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
 			return -EOPNOTSUPP;
 		inline_mode = nla_get_u8(
 				info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
-		err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+		err = ops->eswitch_inline_mode_set(devlink, inline_mode,
+						   info->extack);
 		if (err)
 			return err;
 	}
@@ -1645,7 +1646,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
 		if (!ops->eswitch_encap_mode_set)
 			return -EOPNOTSUPP;
 		encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
-		err = ops->eswitch_encap_mode_set(devlink, encap_mode);
+		err = ops->eswitch_encap_mode_set(devlink, encap_mode,
+						  info->extack);
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From fcd29ad17c6ff885dfae58f557e9323941e63ba2 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Thu, 9 Aug 2018 09:55:21 +0300
Subject: net/mlx5: Add Fast teardown support

Today mlx5 devices support two teardown modes:
1- Regular teardown
2- Force teardown

This change introduces the enhanced version of the "Force teardown" that
allows SW to perform teardown in a faster way without the need to reclaim
all the pages.

Fast teardown provides the following advantages:
1- Fix a FW race condition that could cause command timeout
2- Avoid moving to polling mode
3- Close the vport to prevent PCI ACK to be sent without been scatter
to memory

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c       | 50 +++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 25 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 29 +++++++++----
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    | 12 ++++++
 include/linux/mlx5/device.h                        |  4 ++
 include/linux/mlx5/mlx5_ifc.h                      |  6 ++-
 6 files changed, 103 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 41ad24f0de2c..1ab6f7e3bec6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -250,7 +250,7 @@ int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev)
 	if (ret)
 		return ret;
 
-	force_state = MLX5_GET(teardown_hca_out, out, force_state);
+	force_state = MLX5_GET(teardown_hca_out, out, state);
 	if (force_state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) {
 		mlx5_core_warn(dev, "teardown with force mode failed, doing normal teardown\n");
 		return -EIO;
@@ -259,6 +259,54 @@ int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev)
 	return 0;
 }
 
+#define MLX5_FAST_TEARDOWN_WAIT_MS   3000
+int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev)
+{
+	unsigned long end, delay_ms = MLX5_FAST_TEARDOWN_WAIT_MS;
+	u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0};
+	int state;
+	int ret;
+
+	if (!MLX5_CAP_GEN(dev, fast_teardown)) {
+		mlx5_core_dbg(dev, "fast teardown is not supported in the firmware\n");
+		return -EOPNOTSUPP;
+	}
+
+	MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
+	MLX5_SET(teardown_hca_in, in, profile,
+		 MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN);
+
+	ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (ret)
+		return ret;
+
+	state = MLX5_GET(teardown_hca_out, out, state);
+	if (state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) {
+		mlx5_core_warn(dev, "teardown with fast mode failed\n");
+		return -EIO;
+	}
+
+	mlx5_set_nic_state(dev, MLX5_NIC_IFC_DISABLED);
+
+	/* Loop until device state turns to disable */
+	end = jiffies + msecs_to_jiffies(delay_ms);
+	do {
+		if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+			break;
+
+		cond_resched();
+	} while (!time_after(jiffies, end));
+
+	if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) {
+		dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
+			mlx5_get_nic_state(dev), delay_ms);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 enum mlxsw_reg_mcc_instruction {
 	MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE = 0x01,
 	MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE = 0x02,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 9f39aeca863f..43118de8ee99 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -58,23 +58,26 @@ enum {
 	MLX5_HEALTH_SYNDR_HIGH_TEMP		= 0x10
 };
 
-enum {
-	MLX5_NIC_IFC_FULL		= 0,
-	MLX5_NIC_IFC_DISABLED		= 1,
-	MLX5_NIC_IFC_NO_DRAM_NIC	= 2,
-	MLX5_NIC_IFC_INVALID		= 3
-};
-
 enum {
 	MLX5_DROP_NEW_HEALTH_WORK,
 	MLX5_DROP_NEW_RECOVERY_WORK,
 };
 
-static u8 get_nic_state(struct mlx5_core_dev *dev)
+u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
 {
 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
 }
 
+void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
+{
+	u32 cur_cmdq_addr_l_sz;
+
+	cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz);
+	iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) |
+		    state << MLX5_NIC_IFC_OFFSET,
+		    &dev->iseg->cmdq_addr_l_sz);
+}
+
 static void trigger_cmd_completions(struct mlx5_core_dev *dev)
 {
 	unsigned long flags;
@@ -103,7 +106,7 @@ static int in_fatal(struct mlx5_core_dev *dev)
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct health_buffer __iomem *h = health->health;
 
-	if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
 		return 1;
 
 	if (ioread32be(&h->fw_ver) == 0xffffffff)
@@ -133,7 +136,7 @@ unlock:
 
 static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 {
-	u8 nic_interface = get_nic_state(dev);
+	u8 nic_interface = mlx5_get_nic_state(dev);
 
 	switch (nic_interface) {
 	case MLX5_NIC_IFC_FULL:
@@ -168,7 +171,7 @@ static void health_recover(struct work_struct *work)
 	priv = container_of(health, struct mlx5_priv, health);
 	dev = container_of(priv, struct mlx5_core_dev, priv);
 
-	nic_state = get_nic_state(dev);
+	nic_state = mlx5_get_nic_state(dev);
 	if (nic_state == MLX5_NIC_IFC_INVALID) {
 		dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
 		return;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index b5e9f664fc66..28132c7dc05f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1594,12 +1594,17 @@ static const struct pci_error_handlers mlx5_err_handler = {
 
 static int mlx5_try_fast_unload(struct mlx5_core_dev *dev)
 {
-	int ret;
+	bool fast_teardown = false, force_teardown = false;
+	int ret = 1;
+
+	fast_teardown = MLX5_CAP_GEN(dev, fast_teardown);
+	force_teardown = MLX5_CAP_GEN(dev, force_teardown);
+
+	mlx5_core_dbg(dev, "force teardown firmware support=%d\n", force_teardown);
+	mlx5_core_dbg(dev, "fast teardown firmware support=%d\n", fast_teardown);
 
-	if (!MLX5_CAP_GEN(dev, force_teardown)) {
-		mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n");
+	if (!fast_teardown && !force_teardown)
 		return -EOPNOTSUPP;
-	}
 
 	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
 		mlx5_core_dbg(dev, "Device in internal error state, giving up\n");
@@ -1612,13 +1617,19 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *dev)
 	mlx5_drain_health_wq(dev);
 	mlx5_stop_health_poll(dev, false);
 
+	ret = mlx5_cmd_fast_teardown_hca(dev);
+	if (!ret)
+		goto succeed;
+
 	ret = mlx5_cmd_force_teardown_hca(dev);
-	if (ret) {
-		mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", ret);
-		mlx5_start_health_poll(dev);
-		return ret;
-	}
+	if (!ret)
+		goto succeed;
+
+	mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", ret);
+	mlx5_start_health_poll(dev);
+	return ret;
 
+succeed:
 	mlx5_enter_error_state(dev, true);
 
 	/* Some platforms requiring freeing the IRQ's in the shutdown
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index b4134fa0bba3..cc298527baf1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -95,6 +95,8 @@ int mlx5_query_board_id(struct mlx5_core_dev *dev);
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
+int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev);
+
 void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 		     unsigned long param);
 void mlx5_core_page_fault(struct mlx5_core_dev *dev,
@@ -214,4 +216,14 @@ int mlx5_lag_allow(struct mlx5_core_dev *dev);
 int mlx5_lag_forbid(struct mlx5_core_dev *dev);
 
 void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol);
+
+enum {
+	MLX5_NIC_IFC_FULL		= 0,
+	MLX5_NIC_IFC_DISABLED		= 1,
+	MLX5_NIC_IFC_NO_DRAM_NIC	= 2,
+	MLX5_NIC_IFC_INVALID		= 3
+};
+
+u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
+void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
 #endif /* __MLX5_CORE_H__ */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 11fa4e66afc5..e9b502d5bcc1 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -504,6 +504,10 @@ struct health_buffer {
 	__be16		ext_synd;
 };
 
+enum mlx5_cmd_addr_l_sz_offset {
+	MLX5_NIC_IFC_OFFSET = 8,
+};
+
 struct mlx5_init_seg {
 	__be32			fw_rev;
 	__be32			cmdif_rev_fw_sub;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f043d65b9bac..6e8a882052b1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -896,7 +896,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_mkey[0x6];
 	u8         reserved_at_f0[0x8];
 	u8         dump_fill_mkey[0x1];
-	u8         reserved_at_f9[0x3];
+	u8         reserved_at_f9[0x2];
+	u8         fast_teardown[0x1];
 	u8         log_max_eq[0x4];
 
 	u8         max_indirection[0x8];
@@ -3352,12 +3353,13 @@ struct mlx5_ifc_teardown_hca_out_bits {
 
 	u8         reserved_at_40[0x3f];
 
-	u8         force_state[0x1];
+	u8         state[0x1];
 };
 
 enum {
 	MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE  = 0x0,
 	MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE     = 0x1,
+	MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN = 0x2,
 };
 
 struct mlx5_ifc_teardown_hca_in_bits {
-- 
cgit v1.2.3


From 2070a3e44962212d6ef02c5def821b1b9744e496 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 4 Oct 2018 09:42:29 +0100
Subject: rxrpc: Allow the reply time to be obtained on a client call

Allow the timestamp on the sk_buff holding the first DATA packet of a reply
to be queried.  This can then be used as a base for the expiry time
calculation on the callback promise duration indicated by an operation
result.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/networking/rxrpc.txt | 11 ++++++++++
 include/net/af_rxrpc.h             |  3 +++
 net/rxrpc/recvmsg.c                | 43 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index b5407163d53b..67879992b4c2 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -1069,6 +1069,17 @@ The kernel interface functions are as follows:
 
      This function may transmit a PING ACK.
 
+ (*) Get reply timestamp.
+
+	bool rxrpc_kernel_get_reply_time(struct socket *sock,
+					 struct rxrpc_call *call,
+					 ktime_t *_ts)
+
+     This allows the timestamp on the first DATA packet of the reply of a
+     client call to be queried, provided that it is still in the Rx ring.  If
+     successful, the timestamp will be stored into *_ts and true will be
+     returned; false will be returned otherwise.
+
 
 =======================
 CONFIGURABLE PARAMETERS
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index f53edb3754bc..c4c912554dee 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -13,6 +13,7 @@
 #define _NET_RXRPC_H
 
 #include <linux/rxrpc.h>
+#include <linux/ktime.h>
 
 struct key;
 struct sock;
@@ -77,5 +78,7 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *,
 int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *,
 			    enum rxrpc_call_completion *, u32 *);
 u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *);
+bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *,
+				 ktime_t *);
 
 #endif /* _NET_RXRPC_H */
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 816b19a78809..eaf19ebaa964 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -715,3 +715,46 @@ call_complete:
 	goto out;
 }
 EXPORT_SYMBOL(rxrpc_kernel_recv_data);
+
+/**
+ * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet
+ * @sock: The socket that the call exists on
+ * @call: The call to query
+ * @_ts: Where to put the timestamp
+ *
+ * Retrieve the timestamp from the first DATA packet of the reply if it is
+ * in the ring.  Returns true if successful, false if not.
+ */
+bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call,
+				 ktime_t *_ts)
+{
+	struct sk_buff *skb;
+	rxrpc_seq_t hard_ack, top, seq;
+	bool success = false;
+
+	mutex_lock(&call->user_mutex);
+
+	if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY)
+		goto out;
+
+	hard_ack = call->rx_hard_ack;
+	if (hard_ack != 0)
+		goto out;
+
+	seq = hard_ack + 1;
+	top = smp_load_acquire(&call->rx_top);
+	if (after(seq, top))
+		goto out;
+
+	skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK];
+	if (!skb)
+		goto out;
+
+	*_ts = skb_get_ktime(skb);
+	success = true;
+
+out:
+	mutex_unlock(&call->user_mutex);
+	return success;
+}
+EXPORT_SYMBOL(rxrpc_kernel_get_reply_time);
-- 
cgit v1.2.3


From e908bcf4f1a271e7c264dcbffc5881ced8bfacee Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 4 Oct 2018 09:54:29 +0100
Subject: rxrpc: Allow the reply time to be obtained on a client call

Allow the epoch value to be queried on a server connection.  This is in the
rxrpc header of every packet for use in routing and is derived from the
client's state.  It's also not supposed to change unless the client gets
restarted.

AFS can make use of this information to deduce whether a fileserver has
been restarted because the fileserver makes client calls to the filesystem
driver's cache manager to send notifications (ie. callback breaks) about
conflicting changes from other clients.  These convey the fileserver's own
epoch value back to the filesystem.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/networking/rxrpc.txt | 14 ++++++++++++++
 include/net/af_rxrpc.h             |  1 +
 net/rxrpc/af_rxrpc.c               | 14 ++++++++++++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index 67879992b4c2..605e00cdd6be 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -1080,6 +1080,20 @@ The kernel interface functions are as follows:
      successful, the timestamp will be stored into *_ts and true will be
      returned; false will be returned otherwise.
 
+ (*) Get remote client epoch.
+
+	u32 rxrpc_kernel_get_epoch(struct socket *sock,
+				   struct rxrpc_call *call)
+
+     This allows the epoch that's contained in packets of an incoming client
+     call to be queried.  This value is returned.  The function always
+     successful if the call is still in progress.  It shouldn't be called once
+     the call has expired.  Note that calling this on a local client call only
+     returns the local epoch.
+
+     This value can be used to determine if the remote client has been
+     restarted as it shouldn't change otherwise.
+
 
 =======================
 CONFIGURABLE PARAMETERS
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index c4c912554dee..de587948042a 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -78,6 +78,7 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *,
 int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *,
 			    enum rxrpc_call_completion *, u32 *);
 u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *);
+u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
 bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *,
 				 ktime_t *);
 
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2fdd276f6842..013dbcb052e5 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -385,6 +385,20 @@ u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call)
 }
 EXPORT_SYMBOL(rxrpc_kernel_check_life);
 
+/**
+ * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
+ * @sock: The socket the call is on
+ * @call: The call to query
+ *
+ * Allow a kernel service to retrieve the epoch value from a service call to
+ * see if the client at the other end rebooted.
+ */
+u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call)
+{
+	return call->conn->proto.epoch;
+}
+EXPORT_SYMBOL(rxrpc_kernel_get_epoch);
+
 /**
  * rxrpc_kernel_check_call - Check a call's state
  * @sock: The socket the call is on
-- 
cgit v1.2.3


From bbb4c4323a4d9cb5ca04db904aa3050a7586839a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 4 Oct 2018 14:27:55 +0100
Subject: dns: Allow the dns resolver to retrieve a server set

Allow the DNS resolver to retrieve a set of servers and their associated
addresses, ports, preference and weight ratings.

In terms of communication with userspace, "srv=1" is added to the callout
string (the '1' indicating the maximum data version supported by the
kernel) to ask the userspace side for this.

If the userspace side doesn't recognise it, it will ignore the option and
return the usual text address list.

If the userspace side does recognise it, it will return some binary data
that begins with a zero byte that would cause the string parsers to give an
error.  The second byte contains the version of the data in the blob (this
may be between 1 and the version specified in the callout data).  The
remainder of the payload is version-specific.

In version 1, the payload looks like (note that this is packed):

	u8	Non-string marker (ie. 0)
	u8	Content (0 => Server list)
	u8	Version (ie. 1)
	u8	Source (eg. DNS_RECORD_FROM_DNS_SRV)
	u8	Status (eg. DNS_LOOKUP_GOOD)
	u8	Number of servers
	foreach-server {
		u16	Name length (LE)
		u16	Priority (as per SRV record) (LE)
		u16	Weight (as per SRV record) (LE)
		u16	Port (LE)
		u8	Source (eg. DNS_RECORD_FROM_NSS)
		u8	Status (eg. DNS_LOOKUP_GOT_NOT_FOUND)
		u8	Protocol (eg. DNS_SERVER_PROTOCOL_UDP)
		u8	Number of addresses
		char[]	Name (not NUL-terminated)
		foreach-address {
			u8		Family (AF_INET{,6})
			union {
				u8[4]	ipv4_addr
				u8[16]	ipv6_addr
			}
		}
	}

This can then be used to fetch a whole cell's VL-server configuration for
AFS, for example.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dns_resolver.h      |   4 +-
 include/uapi/linux/dns_resolver.h | 116 ++++++++++++++++++++++++++++++++++++++
 net/dns_resolver/dns_key.c        |  67 +++++++++++++++++++++-
 net/dns_resolver/dns_query.c      |   5 +-
 4 files changed, 182 insertions(+), 10 deletions(-)
 create mode 100644 include/uapi/linux/dns_resolver.h

(limited to 'include')

diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h
index 6ac3cad9aef1..34a744a1bafc 100644
--- a/include/linux/dns_resolver.h
+++ b/include/linux/dns_resolver.h
@@ -24,11 +24,9 @@
 #ifndef _LINUX_DNS_RESOLVER_H
 #define _LINUX_DNS_RESOLVER_H
 
-#ifdef __KERNEL__
+#include <uapi/linux/dns_resolver.h>
 
 extern int dns_query(const char *type, const char *name, size_t namelen,
 		     const char *options, char **_result, time64_t *_expiry);
 
-#endif /* KERNEL */
-
 #endif /* _LINUX_DNS_RESOLVER_H */
diff --git a/include/uapi/linux/dns_resolver.h b/include/uapi/linux/dns_resolver.h
new file mode 100644
index 000000000000..129745f9c794
--- /dev/null
+++ b/include/uapi/linux/dns_resolver.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/* DNS resolver interface definitions.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_DNS_RESOLVER_H
+#define _UAPI_LINUX_DNS_RESOLVER_H
+
+#include <linux/types.h>
+
+/*
+ * Type of payload.
+ */
+enum dns_payload_content_type {
+	DNS_PAYLOAD_IS_SERVER_LIST	= 0, /* List of servers, requested by srv=1 */
+};
+
+/*
+ * Type of address that might be found in an address record.
+ */
+enum dns_payload_address_type {
+	DNS_ADDRESS_IS_IPV4		= 0, /* 4-byte AF_INET address */
+	DNS_ADDRESS_IS_IPV6		= 1, /* 16-byte AF_INET6 address */
+};
+
+/*
+ * Type of protocol used to access a server.
+ */
+enum dns_payload_protocol_type {
+	DNS_SERVER_PROTOCOL_UNSPECIFIED	= 0,
+	DNS_SERVER_PROTOCOL_UDP		= 1, /* Use UDP to talk to the server */
+	DNS_SERVER_PROTOCOL_TCP		= 2, /* Use TCP to talk to the server */
+};
+
+/*
+ * Source of record included in DNS resolver payload.
+ */
+enum dns_record_source {
+	DNS_RECORD_UNAVAILABLE		= 0, /* No source available (empty record) */
+	DNS_RECORD_FROM_CONFIG		= 1, /* From local configuration data */
+	DNS_RECORD_FROM_DNS_A		= 2, /* From DNS A or AAAA record */
+	DNS_RECORD_FROM_DNS_AFSDB	= 3, /* From DNS AFSDB record */
+	DNS_RECORD_FROM_DNS_SRV		= 4, /* From DNS SRV record */
+	DNS_RECORD_FROM_NSS		= 5, /* From NSS */
+	NR__dns_record_source
+};
+
+/*
+ * Status of record included in DNS resolver payload.
+ */
+enum dns_lookup_status {
+	DNS_LOOKUP_NOT_DONE		= 0, /* No lookup has been made */
+	DNS_LOOKUP_GOOD			= 1, /* Good records obtained */
+	DNS_LOOKUP_GOOD_WITH_BAD	= 2, /* Good records, some decoding errors */
+	DNS_LOOKUP_BAD			= 3, /* Couldn't decode results */
+	DNS_LOOKUP_GOT_NOT_FOUND	= 4, /* Got a "Not Found" result */
+	DNS_LOOKUP_GOT_LOCAL_FAILURE	= 5, /* Local failure during lookup */
+	DNS_LOOKUP_GOT_TEMP_FAILURE	= 6, /* Temporary failure during lookup */
+	DNS_LOOKUP_GOT_NS_FAILURE	= 7, /* Name server failure */
+	NR__dns_lookup_status
+};
+
+/*
+ * Header at the beginning of binary format payload.
+ */
+struct dns_payload_header {
+	__u8		zero;		/* Zero byte: marks this as not being text */
+	__u8		content;	/* enum dns_payload_content_type */
+	__u8		version;	/* Encoding version */
+} __packed;
+
+/*
+ * Header at the beginning of a V1 server list.  This is followed directly by
+ * the server records.  Each server records begins with a struct of type
+ * dns_server_list_v1_server.
+ */
+struct dns_server_list_v1_header {
+	struct dns_payload_header hdr;
+	__u8		source;		/* enum dns_record_source */
+	__u8		status;		/* enum dns_lookup_status */
+	__u8		nr_servers;	/* Number of server records following this */
+} __packed;
+
+/*
+ * Header at the beginning of each V1 server record.  This is followed by the
+ * characters of the name with no NUL-terminator, followed by the address
+ * records for that server.  Each address record begins with a struct of type
+ * struct dns_server_list_v1_address.
+ */
+struct dns_server_list_v1_server {
+	__u16		name_len;	/* Length of name (LE) */
+	__u16		priority;	/* Priority (as SRV record) (LE) */
+	__u16		weight;		/* Weight (as SRV record) (LE) */
+	__u16		port;		/* UDP/TCP port number (LE) */
+	__u8		source;		/* enum dns_record_source */
+	__u8		status;		/* enum dns_lookup_status */
+	__u8		protocol;	/* enum dns_payload_protocol_type */
+	__u8		nr_addrs;
+} __packed;
+
+/*
+ * Header at the beginning of each V1 address record.  This is followed by the
+ * bytes of the address, 4 for IPV4 and 16 for IPV6.
+ */
+struct dns_server_list_v1_address {
+	__u8		address_type;	/* enum dns_payload_address_type */
+} __packed;
+
+#endif /* _UAPI_LINUX_DNS_RESOLVER_H */
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 7f4534828f6c..a65d553e730d 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -29,6 +29,7 @@
 #include <linux/keyctl.h>
 #include <linux/err.h>
 #include <linux/seq_file.h>
+#include <linux/dns_resolver.h>
 #include <keys/dns_resolver-type.h>
 #include <keys/user-type.h>
 #include "internal.h"
@@ -48,27 +49,86 @@ const struct cred *dns_resolver_cache;
 /*
  * Preparse instantiation data for a dns_resolver key.
  *
- * The data must be a NUL-terminated string, with the NUL char accounted in
- * datalen.
+ * For normal hostname lookups, the data must be a NUL-terminated string, with
+ * the NUL char accounted in datalen.
  *
  * If the data contains a '#' characters, then we take the clause after each
  * one to be an option of the form 'key=value'.  The actual data of interest is
  * the string leading up to the first '#'.  For instance:
  *
  *        "ip1,ip2,...#foo=bar"
+ *
+ * For server list requests, the data must begin with a NUL char and be
+ * followed by a byte indicating the version of the data format.  Version 1
+ * looks something like (note this is packed):
+ *
+ *	u8      Non-string marker (ie. 0)
+ *	u8	Content (DNS_PAYLOAD_IS_*)
+ *	u8	Version (e.g. 1)
+ *	u8	Source of server list
+ *	u8	Lookup status of server list
+ *	u8	Number of servers
+ *	foreach-server {
+ *		__le16	Name length
+ *		__le16	Priority (as per SRV record, low first)
+ *		__le16	Weight (as per SRV record, higher first)
+ *		__le16	Port
+ *		u8	Source of address list
+ *		u8	Lookup status of address list
+ *		u8	Protocol (DNS_SERVER_PROTOCOL_*)
+ *		u8	Number of addresses
+ *		char[]	Name (not NUL-terminated)
+ *		foreach-address {
+ *			u8		Family (DNS_ADDRESS_IS_*)
+ *			union {
+ *				u8[4]	ipv4_addr
+ *				u8[16]	ipv6_addr
+ *			}
+ *		}
+ *	}
+ *
  */
 static int
 dns_resolver_preparse(struct key_preparsed_payload *prep)
 {
+	const struct dns_payload_header *bin;
 	struct user_key_payload *upayload;
 	unsigned long derrno;
 	int ret;
 	int datalen = prep->datalen, result_len = 0;
 	const char *data = prep->data, *end, *opt;
 
+	if (datalen <= 1 || !data)
+		return -EINVAL;
+
+	if (data[0] == 0) {
+		/* It may be a server list. */
+		if (datalen <= sizeof(*bin))
+			return -EINVAL;
+
+		bin = (const struct dns_payload_header *)data;
+		kenter("[%u,%u],%u", bin->content, bin->version, datalen);
+		if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) {
+			pr_warn_ratelimited(
+				"dns_resolver: Unsupported content type (%u)\n",
+				bin->content);
+			return -EINVAL;
+		}
+
+		if (bin->version != 1) {
+			pr_warn_ratelimited(
+				"dns_resolver: Unsupported server list version (%u)\n",
+				bin->version);
+			return -EINVAL;
+		}
+
+		result_len = datalen;
+		goto store_result;
+	}
+
 	kenter("'%*.*s',%u", datalen, datalen, data, datalen);
 
-	if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+	if (!data || data[datalen - 1] != '\0')
 		return -EINVAL;
 	datalen--;
 
@@ -144,6 +204,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 		return 0;
 	}
 
+store_result:
 	kdebug("store result");
 	prep->quotalen = result_len;
 
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 49da67034f29..76338c38738a 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -148,12 +148,9 @@ int dns_query(const char *type, const char *name, size_t namelen,
 
 	if (_result) {
 		ret = -ENOMEM;
-		*_result = kmalloc(len + 1, GFP_KERNEL);
+		*_result = kmemdup_nul(upayload->data, len, GFP_KERNEL);
 		if (!*_result)
 			goto put;
-
-		memcpy(*_result, upayload->data, len);
-		(*_result)[len] = '\0';
 	}
 
 	if (_expiry)
-- 
cgit v1.2.3


From e3b5106162a3f73c7633ae6051fbf244584ab584 Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Thu, 4 Oct 2018 11:13:44 +0530
Subject: devlink: Add generic parameter ignore_ari

ignore_ari - Device ignores ARI(Alternate Routing ID) capability,
even when platforms has the support and creates same number of
partitions when platform does not support ARI capability.

Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 net/core/devlink.c    | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 70671f0d4c30..ae28ccbd6843 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -362,6 +362,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
 	DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+	DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -380,6 +381,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME "region_snapshot_enable"
 #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari"
+#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index de6adad7ccbe..4a72fede11b4 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2677,6 +2677,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
 		.type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+		.name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
+		.type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From f61cba4291c06c201b1b855a341b036caefdc2d6 Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Thu, 4 Oct 2018 11:13:45 +0530
Subject: devlink: Add generic parameter msix_vec_per_pf_max

msix_vec_per_pf_max - This param sets the number of MSIX vectors
that the device requests from the host on driver initialization.
This value is set in the device which is applicable per PF.

Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 net/core/devlink.c    | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index ae28ccbd6843..c9b08b49957c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -363,6 +363,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
 	DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
 	DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+	DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -384,6 +385,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari"
 #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max"
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4a72fede11b4..d7501a588ad2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2682,6 +2682,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
 		.type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+		.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
+		.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From 16511789b9cc0a946611b1f9575b7a5b2b566301 Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Thu, 4 Oct 2018 11:13:46 +0530
Subject: devlink: Add generic parameter msix_vec_per_pf_min

msix_vec_per_pf_min - This param sets the number of minimal MSIX
vectors required for the device initialization. This value is set
in the device which limits MSIX vectors per PF.

Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 net/core/devlink.c    | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index c9b08b49957c..9a70755ad1c2 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -364,6 +364,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
 	DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
 	DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+	DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -388,6 +389,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max"
 #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32
 
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME "msix_vec_per_pf_min"
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE DEVLINK_PARAM_TYPE_U32
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d7501a588ad2..938f68ee92f0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2687,6 +2687,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
 		.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+		.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
+		.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From 5a781ccbd19e4664babcbe4b4ead7aa2b9283d22 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Fri, 28 Sep 2018 17:59:43 -0700
Subject: tc: Add support for configuring the taprio scheduler

This traffic scheduler allows traffic classes states (transmission
allowed/not allowed, in the simplest case) to be scheduled, according
to a pre-generated time sequence. This is the basis of the IEEE
802.1Qbv specification.

Example configuration:

tc qdisc replace dev enp3s0 parent root handle 100 taprio \
          num_tc 3 \
	  map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
	  queues 1@0 1@1 2@2 \
	  base-time 1528743495910289987 \
	  sched-entry S 01 300000 \
	  sched-entry S 02 300000 \
	  sched-entry S 04 300000 \
	  clockid CLOCK_TAI

The configuration format is similar to mqprio. The main difference is
the presence of a schedule, built by multiple "sched-entry"
definitions, each entry has the following format:

     sched-entry <CMD> <GATE MASK> <INTERVAL>

The only supported <CMD> is "S", which means "SetGateStates",
following the IEEE 802.1Qbv-2015 definition (Table 8-6). <GATE MASK>
is a bitmask where each bit is a associated with a traffic class, so
bit 0 (the least significant bit) being "on" means that traffic class
0 is "active" for that schedule entry. <INTERVAL> is a time duration
in nanoseconds that specifies for how long that state defined by <CMD>
and <GATE MASK> should be held before moving to the next entry.

This schedule is circular, that is, after the last entry is executed
it starts from the first one, indefinitely.

The other parameters can be defined as follows:

 - base-time: specifies the instant when the schedule starts, if
  'base-time' is a time in the past, the schedule will start at

 	      base-time + (N * cycle-time)

   where N is the smallest integer so the resulting time is greater
   than "now", and "cycle-time" is the sum of all the intervals of the
   entries in the schedule;

 - clockid: specifies the reference clock to be used;

The parameters should be similar to what the IEEE 802.1Q family of
specification defines.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  46 ++
 net/sched/Kconfig              |  11 +
 net/sched/Makefile             |   1 +
 net/sched/sch_taprio.c         | 962 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1020 insertions(+)
 create mode 100644 net/sched/sch_taprio.c

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e9b7244ac381..89ee47c2f17d 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1084,4 +1084,50 @@ enum {
 	CAKE_ATM_MAX
 };
 
+
+/* TAPRIO */
+enum {
+	TC_TAPRIO_CMD_SET_GATES = 0x00,
+	TC_TAPRIO_CMD_SET_AND_HOLD = 0x01,
+	TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02,
+};
+
+enum {
+	TCA_TAPRIO_SCHED_ENTRY_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */
+	TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */
+	__TCA_TAPRIO_SCHED_ENTRY_MAX,
+};
+#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1)
+
+/* The format for schedule entry list is:
+ * [TCA_TAPRIO_SCHED_ENTRY_LIST]
+ *   [TCA_TAPRIO_SCHED_ENTRY]
+ *     [TCA_TAPRIO_SCHED_ENTRY_CMD]
+ *     [TCA_TAPRIO_SCHED_ENTRY_GATES]
+ *     [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]
+ */
+enum {
+	TCA_TAPRIO_SCHED_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY,
+	__TCA_TAPRIO_SCHED_MAX,
+};
+
+#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
+
+enum {
+	TCA_TAPRIO_ATTR_UNSPEC,
+	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
+	TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */
+	TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
+	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
+	TCA_TAPRIO_PAD,
+	__TCA_TAPRIO_ATTR_MAX,
+};
+
+#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e95741388311..1b9afdee5ba9 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -194,6 +194,17 @@ config NET_SCH_ETF
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_etf.
 
+config NET_SCH_TAPRIO
+	tristate "Time Aware Priority (taprio) Scheduler"
+	help
+	  Say Y here if you want to use the Time Aware Priority (taprio) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_taprio.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_taprio.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f0403f49edcb..8a40431d7b5c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
+obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
new file mode 100644
index 000000000000..206e4dbed12f
--- /dev/null
+++ b/net/sched/sch_taprio.c
@@ -0,0 +1,962 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_taprio.c	 Time Aware Priority Scheduler
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/sch_generic.h>
+
+#define TAPRIO_ALL_GATES_OPEN -1
+
+struct sched_entry {
+	struct list_head list;
+
+	/* The instant that this entry "closes" and the next one
+	 * should open, the qdisc will make some effort so that no
+	 * packet leaves after this time.
+	 */
+	ktime_t close_time;
+	atomic_t budget;
+	int index;
+	u32 gate_mask;
+	u32 interval;
+	u8 command;
+};
+
+struct taprio_sched {
+	struct Qdisc **qdiscs;
+	struct Qdisc *root;
+	s64 base_time;
+	int clockid;
+	int picos_per_byte; /* Using picoseconds because for 10Gbps+
+			     * speeds it's sub-nanoseconds per byte
+			     */
+	size_t num_entries;
+
+	/* Protects the update side of the RCU protected current_entry */
+	spinlock_t current_entry_lock;
+	struct sched_entry __rcu *current_entry;
+	struct list_head entries;
+	ktime_t (*get_time)(void);
+	struct hrtimer advance_timer;
+};
+
+static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct Qdisc *child;
+	int queue;
+
+	queue = skb_get_queue_mapping(skb);
+
+	child = q->qdiscs[queue];
+	if (unlikely(!child))
+		return qdisc_drop(skb, sch, to_free);
+
+	qdisc_qstats_backlog_inc(sch, skb);
+	sch->q.qlen++;
+
+	return qdisc_enqueue(skb, child, to_free);
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct sched_entry *entry;
+	struct sk_buff *skb;
+	u32 gate_mask;
+	int i;
+
+	rcu_read_lock();
+	entry = rcu_dereference(q->current_entry);
+	gate_mask = entry ? entry->gate_mask : -1;
+	rcu_read_unlock();
+
+	if (!gate_mask)
+		return NULL;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct Qdisc *child = q->qdiscs[i];
+		int prio;
+		u8 tc;
+
+		if (unlikely(!child))
+			continue;
+
+		skb = child->ops->peek(child);
+		if (!skb)
+			continue;
+
+		prio = skb->priority;
+		tc = netdev_get_prio_tc_map(dev, prio);
+
+		if (!(gate_mask & BIT(tc)))
+			return NULL;
+
+		return skb;
+	}
+
+	return NULL;
+}
+
+static inline int length_to_duration(struct taprio_sched *q, int len)
+{
+	return (len * q->picos_per_byte) / 1000;
+}
+
+static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct sched_entry *entry;
+	struct sk_buff *skb;
+	u32 gate_mask;
+	int i;
+
+	rcu_read_lock();
+	entry = rcu_dereference(q->current_entry);
+	/* if there's no entry, it means that the schedule didn't
+	 * start yet, so force all gates to be open, this is in
+	 * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
+	 * "AdminGateSates"
+	 */
+	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
+	rcu_read_unlock();
+
+	if (!gate_mask)
+		return NULL;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct Qdisc *child = q->qdiscs[i];
+		ktime_t guard;
+		int prio;
+		int len;
+		u8 tc;
+
+		if (unlikely(!child))
+			continue;
+
+		skb = child->ops->peek(child);
+		if (!skb)
+			continue;
+
+		prio = skb->priority;
+		tc = netdev_get_prio_tc_map(dev, prio);
+
+		if (!(gate_mask & BIT(tc)))
+			continue;
+
+		len = qdisc_pkt_len(skb);
+		guard = ktime_add_ns(q->get_time(),
+				     length_to_duration(q, len));
+
+		/* In the case that there's no gate entry, there's no
+		 * guard band ...
+		 */
+		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+		    ktime_after(guard, entry->close_time))
+			return NULL;
+
+		/* ... and no budget. */
+		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+		    atomic_sub_return(len, &entry->budget) < 0)
+			return NULL;
+
+		skb = child->ops->dequeue(child);
+		if (unlikely(!skb))
+			return NULL;
+
+		qdisc_bstats_update(sch, skb);
+		qdisc_qstats_backlog_dec(sch, skb);
+		sch->q.qlen--;
+
+		return skb;
+	}
+
+	return NULL;
+}
+
+static bool should_restart_cycle(const struct taprio_sched *q,
+				 const struct sched_entry *entry)
+{
+	WARN_ON(!entry);
+
+	return list_is_last(&entry->list, &q->entries);
+}
+
+static enum hrtimer_restart advance_sched(struct hrtimer *timer)
+{
+	struct taprio_sched *q = container_of(timer, struct taprio_sched,
+					      advance_timer);
+	struct sched_entry *entry, *next;
+	struct Qdisc *sch = q->root;
+	ktime_t close_time;
+
+	spin_lock(&q->current_entry_lock);
+	entry = rcu_dereference_protected(q->current_entry,
+					  lockdep_is_held(&q->current_entry_lock));
+
+	/* This is the case that it's the first time that the schedule
+	 * runs, so it only happens once per schedule. The first entry
+	 * is pre-calculated during the schedule initialization.
+	 */
+	if (unlikely(!entry)) {
+		next = list_first_entry(&q->entries, struct sched_entry,
+					list);
+		close_time = next->close_time;
+		goto first_run;
+	}
+
+	if (should_restart_cycle(q, entry))
+		next = list_first_entry(&q->entries, struct sched_entry,
+					list);
+	else
+		next = list_next_entry(entry, list);
+
+	close_time = ktime_add_ns(entry->close_time, next->interval);
+
+	next->close_time = close_time;
+	atomic_set(&next->budget,
+		   (next->interval * 1000) / q->picos_per_byte);
+
+first_run:
+	rcu_assign_pointer(q->current_entry, next);
+	spin_unlock(&q->current_entry_lock);
+
+	hrtimer_set_expires(&q->advance_timer, close_time);
+
+	rcu_read_lock();
+	__netif_schedule(sch);
+	rcu_read_unlock();
+
+	return HRTIMER_RESTART;
+}
+
+static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
+	[TCA_TAPRIO_SCHED_ENTRY_INDEX]	   = { .type = NLA_U32 },
+	[TCA_TAPRIO_SCHED_ENTRY_CMD]	   = { .type = NLA_U8 },
+	[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
+	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
+};
+
+static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
+	[TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
+	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
+		.len = sizeof(struct tc_mqprio_qopt)
+	},
+	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]     = { .type = NLA_NESTED },
+	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]      = { .type = NLA_S64 },
+	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]   = { .type = NLA_NESTED },
+	[TCA_TAPRIO_ATTR_SCHED_CLOCKID]        = { .type = NLA_S32 },
+};
+
+static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
+			    struct netlink_ext_ack *extack)
+{
+	u32 interval = 0;
+
+	if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
+		entry->command = nla_get_u8(
+			tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
+
+	if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
+		entry->gate_mask = nla_get_u32(
+			tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
+
+	if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
+		interval = nla_get_u32(
+			tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
+
+	if (interval == 0) {
+		NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
+		return -EINVAL;
+	}
+
+	entry->interval = interval;
+
+	return 0;
+}
+
+static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
+			     int index, struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+	int err;
+
+	err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
+			       entry_policy, NULL);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	entry->index = index;
+
+	return fill_sched_entry(tb, entry, extack);
+}
+
+/* Returns the number of entries in case of success */
+static int parse_sched_single_entry(struct nlattr *n,
+				    struct taprio_sched *q,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+	struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { };
+	struct sched_entry *entry;
+	bool found = false;
+	u32 index;
+	int err;
+
+	err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX,
+			       n, entry_list_policy, NULL);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) {
+		NL_SET_ERR_MSG(extack, "Single-entry must include an entry");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX,
+			       tb_list[TCA_TAPRIO_SCHED_ENTRY],
+			       entry_policy, NULL);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) {
+		NL_SET_ERR_MSG(extack, "Entry must specify an index\n");
+		return -EINVAL;
+	}
+
+	index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]);
+	if (index >= q->num_entries) {
+		NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule");
+		return -EINVAL;
+	}
+
+	list_for_each_entry(entry, &q->entries, list) {
+		if (entry->index == index) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		NL_SET_ERR_MSG(extack, "Could not find entry");
+		return -ENOENT;
+	}
+
+	err = fill_sched_entry(tb_entry, entry, extack);
+	if (err < 0)
+		return err;
+
+	return q->num_entries;
+}
+
+static int parse_sched_list(struct nlattr *list,
+			    struct taprio_sched *q,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *n;
+	int err, rem;
+	int i = 0;
+
+	if (!list)
+		return -EINVAL;
+
+	nla_for_each_nested(n, list, rem) {
+		struct sched_entry *entry;
+
+		if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
+			NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
+			continue;
+		}
+
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+			return -ENOMEM;
+		}
+
+		err = parse_sched_entry(n, entry, i, extack);
+		if (err < 0) {
+			kfree(entry);
+			return err;
+		}
+
+		list_add_tail(&entry->list, &q->entries);
+		i++;
+	}
+
+	q->num_entries = i;
+
+	return i;
+}
+
+/* Returns the number of entries in case of success */
+static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q,
+			    struct netlink_ext_ack *extack)
+{
+	int err = 0;
+	int clockid;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] &&
+	    tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
+		return -EINVAL;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0)
+		return -EINVAL;
+
+	if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID])
+		return -EINVAL;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
+		q->base_time = nla_get_s64(
+			tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+		clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+
+		/* We only support static clockids and we don't allow
+		 * for it to be modified after the first init.
+		 */
+		if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid))
+			return -EINVAL;
+
+		q->clockid = clockid;
+	}
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
+		err = parse_sched_list(
+			tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack);
+	else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
+		err = parse_sched_single_entry(
+			tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack);
+
+	/* parse_sched_* return the number of entries in the schedule,
+	 * a schedule with zero entries is an error.
+	 */
+	if (err == 0) {
+		NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry");
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+static int taprio_parse_mqprio_opt(struct net_device *dev,
+				   struct tc_mqprio_qopt *qopt,
+				   struct netlink_ext_ack *extack)
+{
+	int i, j;
+
+	if (!qopt) {
+		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
+		return -EINVAL;
+	}
+
+	/* Verify num_tc is not out of max range */
+	if (qopt->num_tc > TC_MAX_QUEUE) {
+		NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
+		return -EINVAL;
+	}
+
+	/* taprio imposes that traffic classes map 1:n to tx queues */
+	if (qopt->num_tc > dev->num_tx_queues) {
+		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
+		return -EINVAL;
+	}
+
+	/* Verify priority mapping uses valid tcs */
+	for (i = 0; i < TC_BITMASK + 1; i++) {
+		if (qopt->prio_tc_map[i] >= qopt->num_tc) {
+			NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < qopt->num_tc; i++) {
+		unsigned int last = qopt->offset[i] + qopt->count[i];
+
+		/* Verify the queue count is in tx range being equal to the
+		 * real_num_tx_queues indicates the last queue is in use.
+		 */
+		if (qopt->offset[i] >= dev->num_tx_queues ||
+		    !qopt->count[i] ||
+		    last > dev->real_num_tx_queues) {
+			NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
+			return -EINVAL;
+		}
+
+		/* Verify that the offset and counts do not overlap */
+		for (j = i + 1; j < qopt->num_tc; j++) {
+			if (last > qopt->offset[j]) {
+				NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static ktime_t taprio_get_start_time(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct sched_entry *entry;
+	ktime_t now, base, cycle;
+	s64 n;
+
+	base = ns_to_ktime(q->base_time);
+	cycle = 0;
+
+	/* Calculate the cycle_time, by summing all the intervals.
+	 */
+	list_for_each_entry(entry, &q->entries, list)
+		cycle = ktime_add_ns(cycle, entry->interval);
+
+	if (!cycle)
+		return base;
+
+	now = q->get_time();
+
+	if (ktime_after(base, now))
+		return base;
+
+	/* Schedule the start time for the beginning of the next
+	 * cycle.
+	 */
+	n = div64_s64(ktime_sub_ns(now, base), cycle);
+
+	return ktime_add_ns(base, (n + 1) * cycle);
+}
+
+static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct sched_entry *first;
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->current_entry_lock, flags);
+
+	first = list_first_entry(&q->entries, struct sched_entry,
+				 list);
+
+	first->close_time = ktime_add_ns(start, first->interval);
+	atomic_set(&first->budget,
+		   (first->interval * 1000) / q->picos_per_byte);
+	rcu_assign_pointer(q->current_entry, NULL);
+
+	spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
+}
+
+static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
+			 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mqprio_qopt *mqprio = NULL;
+	struct ethtool_link_ksettings ecmd;
+	int i, err, size;
+	s64 link_speed;
+	ktime_t start;
+
+	err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
+			       taprio_policy, extack);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
+		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
+
+	err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+	if (err < 0)
+		return err;
+
+	/* A schedule with less than one entry is an error */
+	size = parse_taprio_opt(tb, q, extack);
+	if (size < 0)
+		return size;
+
+	hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
+	q->advance_timer.function = advance_sched;
+
+	switch (q->clockid) {
+	case CLOCK_REALTIME:
+		q->get_time = ktime_get_real;
+		break;
+	case CLOCK_MONOTONIC:
+		q->get_time = ktime_get;
+		break;
+	case CLOCK_BOOTTIME:
+		q->get_time = ktime_get_boottime;
+		break;
+	case CLOCK_TAI:
+		q->get_time = ktime_get_clocktai;
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct netdev_queue *dev_queue;
+		struct Qdisc *qdisc;
+
+		dev_queue = netdev_get_tx_queue(dev, i);
+		qdisc = qdisc_create_dflt(dev_queue,
+					  &pfifo_qdisc_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)),
+					  extack);
+		if (!qdisc)
+			return -ENOMEM;
+
+		if (i < dev->real_num_tx_queues)
+			qdisc_hash_add(qdisc, false);
+
+		q->qdiscs[i] = qdisc;
+	}
+
+	if (mqprio) {
+		netdev_set_num_tc(dev, mqprio->num_tc);
+		for (i = 0; i < mqprio->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    mqprio->count[i],
+					    mqprio->offset[i]);
+
+		/* Always use supplied priority mappings */
+		for (i = 0; i < TC_BITMASK + 1; i++)
+			netdev_set_prio_tc_map(dev, i,
+					       mqprio->prio_tc_map[i]);
+	}
+
+	if (!__ethtool_get_link_ksettings(dev, &ecmd))
+		link_speed = ecmd.base.speed;
+	else
+		link_speed = SPEED_1000;
+
+	q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
+				      link_speed * 1000 * 1000);
+
+	start = taprio_get_start_time(sch);
+	if (!start)
+		return 0;
+
+	taprio_start_sched(sch, start);
+
+	return 0;
+}
+
+static void taprio_destroy(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct sched_entry *entry, *n;
+	unsigned int i;
+
+	hrtimer_cancel(&q->advance_timer);
+
+	if (q->qdiscs) {
+		for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
+			qdisc_put(q->qdiscs[i]);
+
+		kfree(q->qdiscs);
+	}
+	q->qdiscs = NULL;
+
+	netdev_set_num_tc(dev, 0);
+
+	list_for_each_entry_safe(entry, n, &q->entries, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+
+	INIT_LIST_HEAD(&q->entries);
+	spin_lock_init(&q->current_entry_lock);
+
+	/* We may overwrite the configuration later */
+	hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
+
+	q->root = sch;
+
+	/* We only support static clockids. Use an invalid value as default
+	 * and get the valid one on taprio_change().
+	 */
+	q->clockid = -1;
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	/* pre-allocate qdisc, attachment can't fail */
+	q->qdiscs = kcalloc(dev->num_tx_queues,
+			    sizeof(q->qdiscs[0]),
+			    GFP_KERNEL);
+
+	if (!q->qdiscs)
+		return -ENOMEM;
+
+	if (!opt)
+		return -EINVAL;
+
+	return taprio_change(sch, opt, extack);
+}
+
+static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
+					     unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1;
+
+	if (ntx >= dev->num_tx_queues)
+		return NULL;
+
+	return netdev_get_tx_queue(dev, ntx);
+}
+
+static int taprio_graft(struct Qdisc *sch, unsigned long cl,
+			struct Qdisc *new, struct Qdisc **old,
+			struct netlink_ext_ack *extack)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return -EINVAL;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	*old = q->qdiscs[cl - 1];
+	q->qdiscs[cl - 1] = new;
+
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return 0;
+}
+
+static int dump_entry(struct sk_buff *msg,
+		      const struct sched_entry *entry)
+{
+	struct nlattr *item;
+
+	item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY);
+	if (!item)
+		return -ENOSPC;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
+			entry->gate_mask))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
+			entry->interval))
+		goto nla_put_failure;
+
+	return nla_nest_end(msg, item);
+
+nla_put_failure:
+	nla_nest_cancel(msg, item);
+	return -1;
+}
+
+static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mqprio_qopt opt = { 0 };
+	struct nlattr *nest, *entry_list;
+	struct sched_entry *entry;
+	unsigned int i;
+
+	opt.num_tc = netdev_get_num_tc(dev);
+	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+
+	for (i = 0; i < netdev_get_num_tc(dev); i++) {
+		opt.count[i] = dev->tc_to_txq[i].count;
+		opt.offset[i] = dev->tc_to_txq[i].offset;
+	}
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		return -ENOSPC;
+
+	if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
+		goto options_error;
+
+	if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
+			q->base_time, TCA_TAPRIO_PAD))
+		goto options_error;
+
+	if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+		goto options_error;
+
+	entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
+	if (!entry_list)
+		goto options_error;
+
+	list_for_each_entry(entry, &q->entries, list) {
+		if (dump_entry(skb, entry) < 0)
+			goto options_error;
+	}
+
+	nla_nest_end(skb, entry_list);
+
+	return nla_nest_end(skb, nest);
+
+options_error:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return NULL;
+
+	return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
+{
+	unsigned int ntx = TC_H_MIN(classid);
+
+	if (!taprio_queue_get(sch, ntx))
+		return 0;
+	return ntx;
+}
+
+static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+
+	return 0;
+}
+
+static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+	__releases(d->lock)
+	__acquires(d->lock)
+{
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	sch = dev_queue->qdisc_sleeping;
+	if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+		return -1;
+	return 0;
+}
+
+static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx;
+
+	if (arg->stop)
+		return;
+
+	arg->count = arg->skip;
+	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
+						struct tcmsg *tcm)
+{
+	return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
+static const struct Qdisc_class_ops taprio_class_ops = {
+	.graft		= taprio_graft,
+	.leaf		= taprio_leaf,
+	.find		= taprio_find,
+	.walk		= taprio_walk,
+	.dump		= taprio_dump_class,
+	.dump_stats	= taprio_dump_class_stats,
+	.select_queue	= taprio_select_queue,
+};
+
+static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
+	.cl_ops		= &taprio_class_ops,
+	.id		= "taprio",
+	.priv_size	= sizeof(struct taprio_sched),
+	.init		= taprio_init,
+	.destroy	= taprio_destroy,
+	.peek		= taprio_peek,
+	.dequeue	= taprio_dequeue,
+	.enqueue	= taprio_enqueue,
+	.dump		= taprio_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init taprio_module_init(void)
+{
+	return register_qdisc(&taprio_qdisc_ops);
+}
+
+static void __exit taprio_module_exit(void)
+{
+	unregister_qdisc(&taprio_qdisc_ops);
+}
+
+module_init(taprio_module_init);
+module_exit(taprio_module_exit);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From d26d4b194e582c6f2070cc5f7f74a72124ad41ef Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 4 Oct 2018 17:07:51 -0700
Subject: net: sched: remove unused helpers

tcf_block_dev() doesn't seem to be used anywhere in the tree.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bbfe27f86d5f..72ffb3120ced 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -65,11 +65,6 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 	return block->q;
 }
 
-static inline struct net_device *tcf_block_dev(struct tcf_block *block)
-{
-	return tcf_block_q(block)->dev_queue->dev;
-}
-
 void *tcf_block_cb_priv(struct tcf_block_cb *block_cb);
 struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
 					 tc_setup_cb_t *cb, void *cb_ident);
@@ -122,11 +117,6 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 	return NULL;
 }
 
-static inline struct net_device *tcf_block_dev(struct tcf_block *block)
-{
-	return NULL;
-}
-
 static inline
 int tc_setup_cb_block_register(struct tcf_block *block, tc_setup_cb_t *cb,
 			       void *cb_priv)
-- 
cgit v1.2.3


From 767a2217533fed696af0d06bee7746d34c4e00aa Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 4 Oct 2018 20:07:51 -0700
Subject: net: common metrics init helper for FIB entries

Consolidate initialization of ipv4 and ipv6 metrics when fib entries
are created into a single helper, ip_fib_metrics_init, that handles
the call to ip_metrics_convert.

If no metrics are defined for the fib entry, then the metrics is set
to dst_default_metrics.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h         |  4 ++--
 net/ipv4/fib_semantics.c | 27 +++++++--------------------
 net/ipv4/metrics.c       | 30 +++++++++++++++++++++++++++---
 net/ipv6/ip6_fib.c       |  2 --
 net/ipv6/route.c         | 29 +++++++----------------------
 5 files changed, 43 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index e44b1a44f67a..8cbe7e8c9e1e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -420,8 +420,8 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }
 
-int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
-		       u32 *metrics);
+struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
+					int fc_mx_len);
 
 u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index bee8db979195..e8f4add597ed 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1020,13 +1020,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
 	return true;
 }
 
-static int
-fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
-{
-	return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
-				  fi->fib_metrics->metrics);
-}
-
 struct fib_info *fib_create_info(struct fib_config *cfg,
 				 struct netlink_ext_ack *extack)
 {
@@ -1084,16 +1077,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
 	if (!fi)
 		goto failure;
-	if (cfg->fc_mx) {
-		fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
-		if (unlikely(!fi->fib_metrics)) {
-			kfree(fi);
-			return ERR_PTR(err);
-		}
-		refcount_set(&fi->fib_metrics->refcnt, 1);
-	} else {
-		fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
+	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
+					      cfg->fc_mx_len);
+	if (unlikely(IS_ERR(fi->fib_metrics))) {
+		err = PTR_ERR(fi->fib_metrics);
+		kfree(fi);
+		return ERR_PTR(err);
 	}
+
 	fib_info_cnt++;
 	fi->fib_net = net;
 	fi->fib_protocol = cfg->fc_protocol;
@@ -1112,10 +1103,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			goto failure;
 	} endfor_nexthops(fi)
 
-	err = fib_convert_metrics(fi, cfg);
-	if (err)
-		goto failure;
-
 	if (cfg->fc_mp) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 04311f7067e2..6d218f5a2e71 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -5,8 +5,8 @@
 #include <net/net_namespace.h>
 #include <net/tcp.h>
 
-int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
-		       u32 *metrics)
+static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
+			      int fc_mx_len, u32 *metrics)
 {
 	bool ecn_ca = false;
 	struct nlattr *nla;
@@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(ip_metrics_convert);
+
+struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
+					int fc_mx_len)
+{
+	struct dst_metrics *fib_metrics;
+	int err;
+
+	if (!fc_mx)
+		return (struct dst_metrics *)&dst_default_metrics;
+
+	fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL);
+	if (unlikely(!fib_metrics))
+		return ERR_PTR(-ENOMEM);
+
+	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+	if (!err) {
+		refcount_set(&fib_metrics->refcnt, 1);
+	} else {
+		kfree(fib_metrics);
+		fib_metrics = ERR_PTR(err);
+	}
+
+	return fib_metrics;
+}
+EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5516f55e214b..de063780a175 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -160,8 +160,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 	}
 
 	INIT_LIST_HEAD(&f6i->fib6_siblings);
-	f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
-
 	atomic_inc(&f6i->fib6_ref);
 
 	return f6i;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3adf107b42d2..b62b7aa53bbe 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2705,24 +2705,6 @@ out:
 	return entries > rt_max_size;
 }
 
-static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
-			       struct fib6_config *cfg)
-{
-	struct dst_metrics *p;
-
-	if (!cfg->fc_mx)
-		return 0;
-
-	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
-	if (unlikely(!p))
-		return -ENOMEM;
-
-	refcount_set(&p->refcnt, 1);
-	rt->fib6_metrics = p;
-
-	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
-}
-
 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 					    struct fib6_config *cfg,
 					    const struct in6_addr *gw_addr,
@@ -2998,13 +2980,15 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (!rt)
 		goto out;
 
+	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
+	if (IS_ERR(rt->fib6_metrics)) {
+		err = PTR_ERR(rt->fib6_metrics);
+		goto out;
+	}
+
 	if (cfg->fc_flags & RTF_ADDRCONF)
 		rt->dst_nocount = true;
 
-	err = ip6_convert_metrics(net, rt, cfg);
-	if (err < 0)
-		goto out;
-
 	if (cfg->fc_flags & RTF_EXPIRES)
 		fib6_set_expires(rt, jiffies +
 				clock_t_to_jiffies(cfg->fc_expires));
@@ -3727,6 +3711,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
 	if (!f6i)
 		return ERR_PTR(-ENOMEM);
 
+	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
 	f6i->dst_nocount = true;
 	f6i->dst_host = true;
 	f6i->fib6_protocol = RTPROT_KERNEL;
-- 
cgit v1.2.3


From cc5f0eb2164f9aa11fe631f8d905192e0233e262 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 4 Oct 2018 20:07:52 -0700
Subject: net: Move free of fib_metrics to helper

Move the refcounting and potential free of dst metrics associated
with a fib entry to a helper and use it in both ipv4 and ipv6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h         | 6 ++++++
 net/ipv4/fib_semantics.c | 6 ++----
 net/ipv6/ip6_fib.c       | 6 ++----
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 8cbe7e8c9e1e..8fdd58ce580d 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -422,6 +422,12 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 
 struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
 					int fc_mx_len);
+static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
+{
+	if (fib_metrics != &dst_default_metrics &&
+	    refcount_dec_and_test(&fib_metrics->refcnt))
+		kfree(fib_metrics);
+}
 
 u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e8f4add597ed..f8c7ec8171a8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
 static void free_fib_info_rcu(struct rcu_head *head)
 {
 	struct fib_info *fi = container_of(head, struct fib_info, rcu);
-	struct dst_metrics *m;
 
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_dev)
@@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
-	m = fi->fib_metrics;
-	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
-		kfree(m);
+	ip_fib_metrics_put(fi->fib_metrics);
+
 	kfree(fi);
 }
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index de063780a175..cf709eadc932 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 
+#include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ndisc.h>
 #include <net/addrconf.h>
@@ -169,7 +170,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 {
 	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
 	struct rt6_exception_bucket *bucket;
-	struct dst_metrics *m;
 
 	WARN_ON(f6i->fib6_node);
 
@@ -201,9 +201,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 	if (f6i->fib6_nh.nh_dev)
 		dev_put(f6i->fib6_nh.nh_dev);
 
-	m = f6i->fib6_metrics;
-	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
-		kfree(m);
+	ip_fib_metrics_put(f6i->fib6_metrics);
 
 	kfree(f6i);
 }
-- 
cgit v1.2.3


From e1255ed4b6dafd9966c99cde5105891cc1ac70df Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 4 Oct 2018 20:07:53 -0700
Subject: net: common metrics init helper for dst_entry

ipv4 and ipv6 both use refcounted metrics if FIB entries have metrics set.
Move the common initialization code to a helper and use for both protocols.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 12 ++++++++++++
 net/ipv4/route.c |  7 ++-----
 net/ipv6/route.c |  6 +-----
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 8fdd58ce580d..f9a7125b4bda 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -429,6 +429,18 @@ static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
 		kfree(fib_metrics);
 }
 
+/* ipv4 and ipv6 both use refcounted metrics if it is not the default */
+static inline
+void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics)
+{
+	dst_init_metrics(dst, fib_metrics->metrics, true);
+
+	if (fib_metrics != &dst_default_metrics) {
+		dst->_metrics |= DST_METRICS_REFCOUNTED;
+		refcount_inc(&fib_metrics->refcnt);
+	}
+}
+
 u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 048919713f4e..8ccbc8f2c2cc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1528,11 +1528,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			rt->rt_gateway = nh->nh_gw;
 			rt->rt_uses_gateway = 1;
 		}
-		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
-		if (fi->fib_metrics != &dst_default_metrics) {
-			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
-			refcount_inc(&fi->fib_metrics->refcnt);
-		}
+		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b62b7aa53bbe..b91a9d3cf288 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -978,11 +978,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	rcu_assign_pointer(rt->from, from);
-	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
-	if (from->fib6_metrics != &dst_default_metrics) {
-		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
-		refcount_inc(&from->fib6_metrics->refcnt);
-	}
+	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
 }
 
 /* Caller must already hold reference to @ort */
-- 
cgit v1.2.3


From 1620a33695d81611360d813a47ebde9386714036 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 4 Oct 2018 20:07:54 -0700
Subject: net: Move free of dst_metrics to helper

Move the refcounting and potential free of dst metrics associated
for ipv4 and ipv6 to a common helper.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 9 +++++++++
 net/ipv4/route.c | 5 +----
 net/ipv6/route.c | 5 +----
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index f9a7125b4bda..72593e171d14 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -441,6 +441,15 @@ void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics)
 	}
 }
 
+static inline
+void ip_dst_metrics_put(struct dst_entry *dst)
+{
+	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
+
+	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
+		kfree(p);
+}
+
 u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8ccbc8f2c2cc..f71d2395c428 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1476,12 +1476,9 @@ void rt_del_uncached_list(struct rtable *rt)
 
 static void ipv4_dst_destroy(struct dst_entry *dst)
 {
-	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
 	struct rtable *rt = (struct rtable *)dst;
 
-	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
-		kfree(p);
-
+	ip_dst_metrics_put(dst);
 	rt_del_uncached_list(rt);
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b91a9d3cf288..6c1d817151ca 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -364,14 +364,11 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
-	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
 	struct rt6_info *rt = (struct rt6_info *)dst;
 	struct fib6_info *from;
 	struct inet6_dev *idev;
 
-	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
-		kfree(p);
-
+	ip_dst_metrics_put(dst);
 	rt6_uncached_list_del(rt);
 
 	idev = rt->rt6i_idev;
-- 
cgit v1.2.3


From 661b8d1b0e3a745e25f05adef2ebd00d830eeea7 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 1 Oct 2018 14:51:33 +0200
Subject: net: add umem reference in netdev{_rx}_queue

These references to the umem will be used to store information
on what kind of AF_XDP umem that is bound to a queue id, if any.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1cbbf77a685f..8318f79586c2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -609,6 +609,9 @@ struct netdev_queue {
 
 	/* Subordinate device that the queue has been assigned to */
 	struct net_device	*sb_dev;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xdp_umem         *umem;
+#endif
 /*
  * write-mostly part
  */
@@ -738,6 +741,9 @@ struct netdev_rx_queue {
 	struct kobject			kobj;
 	struct net_device		*dev;
 	struct xdp_rxq_info		xdp_rxq;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xdp_umem                 *umem;
+#endif
 } ____cacheline_aligned_in_smp;
 
 /*
-- 
cgit v1.2.3


From 1661d346628115c364e2b7d5b15a64ca3bd0dbd4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 1 Oct 2018 14:51:36 +0200
Subject: ethtool: don't allow disabling queues with umem installed

We already check the RSS indirection table does not use queues which
would be disabled by channel reconfiguration. Make sure user does not
try to disable queues which have a UMEM and zero-copy AF_XDP socket
installed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  7 +++++++
 net/core/ethtool.c     | 11 +++++++++++
 net/xdp/xdp_umem.c     |  4 ++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 70a115bea4f4..13acb9803a6d 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -86,6 +86,7 @@ struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
 struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
 					  struct xdp_umem_fq_reuse *newq);
 void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -183,6 +184,12 @@ static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
 {
 }
 
+static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+						     u16 queue_id)
+{
+	return NULL;
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
 	return NULL;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9a648fb09594..5a788adeba0b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -27,6 +27,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/sched/signal.h>
 #include <linux/net.h>
+#include <net/xdp_sock.h>
 
 /*
  * Some useful ethtool_ops methods that're device independent.
@@ -1656,7 +1657,9 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 						   void __user *useraddr)
 {
 	struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
+	u16 from_channel, to_channel;
 	u32 max_rx_in_use = 0;
+	unsigned int i;
 
 	if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
 		return -EOPNOTSUPP;
@@ -1680,6 +1683,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	    (channels.combined_count + channels.rx_count) <= max_rx_in_use)
 	    return -EINVAL;
 
+	/* Disabling channels, query zero-copy AF_XDP sockets */
+	from_channel = channels.combined_count +
+		min(channels.rx_count, channels.tx_count);
+	to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
+	for (i = from_channel; i < to_channel; i++)
+		if (xdp_get_umem_from_qid(dev, i))
+			return -EINVAL;
+
 	return dev->ethtool_ops->set_channels(dev, &channels);
 }
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 4d6c6652f5d1..773326f682b1 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -55,8 +55,8 @@ static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
 		dev->_tx[queue_id].umem = umem;
 }
 
-static struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
-					      u16 queue_id)
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+				       u16 queue_id)
 {
 	if (queue_id < dev->real_num_rx_queues)
 		return dev->_rx[queue_id].umem;
-- 
cgit v1.2.3


From 95278ddaa15cfa23e4a06ee9ed7b6ee0197c500b Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 2 Oct 2018 12:50:19 -0700
Subject: net_sched: convert idrinfo->lock from spinlock to a mutex

In commit ec3ed293e766 ("net_sched: change tcf_del_walker() to take idrinfo->lock")
we move fl_hw_destroy_tmplt() to a workqueue to avoid blocking
with the spinlock held. Unfortunately, this causes a lot of
troubles here:

1. tcf_chain_destroy() could be called right after we queue the work
   but before the work runs. This is a use-after-free.

2. The chain refcnt is already 0, we can't even just hold it again.
   We can check refcnt==1 but it is ugly.

3. The chain with refcnt 0 is still visible in its block, which means
   it could be still found and used!

4. The block has a refcnt too, we can't hold it without introducing a
   proper API either.

We can make it working but the end result is ugly. Instead of wasting
time on reviewing it, let's just convert the troubling spinlock to
a mutex, which allows us to use non-atomic allocations too.

Fixes: ec3ed293e766 ("net_sched: change tcf_del_walker() to take idrinfo->lock")
Reported-by: Ido Schimmel <idosch@idosch.org>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h  |  4 ++--
 net/sched/act_api.c    | 44 ++++++++++++++++++++++----------------------
 net/sched/cls_flower.c | 13 ++-----------
 3 files changed, 26 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 1ddff3360592..05c7df41d737 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -13,7 +13,7 @@
 #include <net/netns/generic.h>
 
 struct tcf_idrinfo {
-	spinlock_t	lock;
+	struct mutex	lock;
 	struct idr	action_idr;
 };
 
@@ -117,7 +117,7 @@ int tc_action_net_init(struct tc_action_net *tn,
 	if (!tn->idrinfo)
 		return -ENOMEM;
 	tn->ops = ops;
-	spin_lock_init(&tn->idrinfo->lock);
+	mutex_init(&tn->idrinfo->lock);
 	idr_init(&tn->idrinfo->action_idr);
 	return err;
 }
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3c7c23421885..55153da00278 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -104,11 +104,11 @@ static int __tcf_action_put(struct tc_action *p, bool bind)
 {
 	struct tcf_idrinfo *idrinfo = p->idrinfo;
 
-	if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
+	if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
 		if (bind)
 			atomic_dec(&p->tcfa_bindcnt);
 		idr_remove(&idrinfo->action_idr, p->tcfa_index);
-		spin_unlock(&idrinfo->lock);
+		mutex_unlock(&idrinfo->lock);
 
 		tcf_action_cleanup(p);
 		return 1;
@@ -200,7 +200,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	struct tc_action *p;
 	unsigned long id = 1;
 
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 
 	s_i = cb->args[0];
 
@@ -235,7 +235,7 @@ done:
 	if (index >= 0)
 		cb->args[0] = index + 1;
 
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 	if (n_i) {
 		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
 			cb->args[1] = n_i;
@@ -277,18 +277,18 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	if (nla_put_string(skb, TCA_KIND, ops->kind))
 		goto nla_put_failure;
 
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 	idr_for_each_entry_ul(idr, p, id) {
 		ret = tcf_idr_release_unsafe(p);
 		if (ret == ACT_P_DELETED) {
 			module_put(ops->owner);
 			n_i++;
 		} else if (ret < 0) {
-			spin_unlock(&idrinfo->lock);
+			mutex_unlock(&idrinfo->lock);
 			goto nla_put_failure;
 		}
 	}
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 
 	if (nla_put_u32(skb, TCA_FCNT, n_i))
 		goto nla_put_failure;
@@ -324,13 +324,13 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 	struct tc_action *p;
 
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 	p = idr_find(&idrinfo->action_idr, index);
 	if (IS_ERR(p))
 		p = NULL;
 	else if (p)
 		refcount_inc(&p->tcfa_refcnt);
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 
 	if (p) {
 		*a = p;
@@ -345,10 +345,10 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
 	struct tc_action *p;
 	int ret = 0;
 
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 	p = idr_find(&idrinfo->action_idr, index);
 	if (!p) {
-		spin_unlock(&idrinfo->lock);
+		mutex_unlock(&idrinfo->lock);
 		return -ENOENT;
 	}
 
@@ -358,7 +358,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
 
 			WARN_ON(p != idr_remove(&idrinfo->action_idr,
 						p->tcfa_index));
-			spin_unlock(&idrinfo->lock);
+			mutex_unlock(&idrinfo->lock);
 
 			tcf_action_cleanup(p);
 			module_put(owner);
@@ -369,7 +369,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
 		ret = -EPERM;
 	}
 
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 	return ret;
 }
 
@@ -431,10 +431,10 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
 {
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 	/* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
 	WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 }
 EXPORT_SYMBOL(tcf_idr_insert);
 
@@ -444,10 +444,10 @@ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
 {
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 	/* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
 	WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index)));
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 }
 EXPORT_SYMBOL(tcf_idr_cleanup);
 
@@ -465,14 +465,14 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
 	int ret;
 
 again:
-	spin_lock(&idrinfo->lock);
+	mutex_lock(&idrinfo->lock);
 	if (*index) {
 		p = idr_find(&idrinfo->action_idr, *index);
 		if (IS_ERR(p)) {
 			/* This means that another process allocated
 			 * index but did not assign the pointer yet.
 			 */
-			spin_unlock(&idrinfo->lock);
+			mutex_unlock(&idrinfo->lock);
 			goto again;
 		}
 
@@ -485,7 +485,7 @@ again:
 		} else {
 			*a = NULL;
 			ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
-					    *index, GFP_ATOMIC);
+					    *index, GFP_KERNEL);
 			if (!ret)
 				idr_replace(&idrinfo->action_idr,
 					    ERR_PTR(-EBUSY), *index);
@@ -494,12 +494,12 @@ again:
 		*index = 1;
 		*a = NULL;
 		ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
-				    UINT_MAX, GFP_ATOMIC);
+				    UINT_MAX, GFP_KERNEL);
 		if (!ret)
 			idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
 				    *index);
 	}
-	spin_unlock(&idrinfo->lock);
+	mutex_unlock(&idrinfo->lock);
 	return ret;
 }
 EXPORT_SYMBOL(tcf_idr_check_alloc);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 92dd5071a708..9aada2d0ef06 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -79,7 +79,6 @@ struct fl_flow_tmplt {
 	struct fl_flow_key mask;
 	struct flow_dissector dissector;
 	struct tcf_chain *chain;
-	struct rcu_work rwork;
 };
 
 struct cls_fl_head {
@@ -1438,20 +1437,12 @@ errout_tb:
 	return ERR_PTR(err);
 }
 
-static void fl_tmplt_destroy_work(struct work_struct *work)
-{
-	struct fl_flow_tmplt *tmplt = container_of(to_rcu_work(work),
-						 struct fl_flow_tmplt, rwork);
-
-	fl_hw_destroy_tmplt(tmplt->chain, tmplt);
-	kfree(tmplt);
-}
-
 static void fl_tmplt_destroy(void *tmplt_priv)
 {
 	struct fl_flow_tmplt *tmplt = tmplt_priv;
 
-	tcf_queue_work(&tmplt->rwork, fl_tmplt_destroy_work);
+	fl_hw_destroy_tmplt(tmplt->chain, tmplt);
+	kfree(tmplt);
 }
 
 static int fl_dump_key_val(struct sk_buff *skb,
-- 
cgit v1.2.3


From f2e9de210d50187d206989e60bc5a99c2b692209 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 5 Oct 2018 11:31:40 -0400
Subject: udp: gro behind static key

Avoid the socket lookup cost in udp_gro_receive if no socket has a
udp tunnel callback configured.

udp_sk(sk)->gro_receive requires a registration with
setup_udp_tunnel_sock, which enables the static key.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h      | 2 ++
 net/ipv4/udp.c         | 2 +-
 net/ipv4/udp_offload.c | 2 +-
 net/ipv6/udp.c         | 2 +-
 net/ipv6/udp_offload.c | 2 +-
 5 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index 8482a990b0bb..9e82cb391dea 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -443,8 +443,10 @@ int udpv4_offload_init(void);
 
 void udp_init(void);
 
+DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void);
 #if IS_ENABLED(CONFIG_IPV6)
+DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void);
 #endif
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5fc4beb1c336..1bec2203d558 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void)
 {
 	static_branch_enable(&udp_encap_needed_key);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0c0522b79b43..802f2bc00d69 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
-	if (unlikely(!uh))
+	if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
 		goto flush;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 28c4aa5078fc..374e7d302f26 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -548,7 +548,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
-static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void)
 {
 	static_branch_enable(&udpv6_encap_needed_key);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 95dee9ca8d22..1b8e161ac527 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -119,7 +119,7 @@ static struct sk_buff *udp6_gro_receive(struct list_head *head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
-	if (unlikely(!uh))
+	if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
 		goto flush;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
-- 
cgit v1.2.3


From 8afc978925ba0a0e8c7f76ebd0d764de0da2c0e9 Mon Sep 17 00:00:00 2001
From: Quentin Schulz <quentin.schulz@bootlin.com>
Date: Thu, 4 Oct 2018 14:22:01 +0200
Subject: net: mscc: ocelot: move the HSIO header to include/soc

Since HSIO address space can be used by different drivers (PLL, SerDes
muxing, temperature sensor), let's move it somewhere it can be included
by all drivers.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.h      |   2 +-
 drivers/net/ethernet/mscc/ocelot_hsio.h | 785 --------------------------------
 include/soc/mscc/ocelot_hsio.h          | 785 ++++++++++++++++++++++++++++++++
 3 files changed, 786 insertions(+), 786 deletions(-)
 delete mode 100644 drivers/net/ethernet/mscc/ocelot_hsio.h
 create mode 100644 include/soc/mscc/ocelot_hsio.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 616bec30dfa3..d3980158c4a3 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -13,10 +13,10 @@
 #include <linux/if_vlan.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
+#include <soc/mscc/ocelot_hsio.h>
 
 #include "ocelot_ana.h"
 #include "ocelot_dev.h"
-#include "ocelot_hsio.h"
 #include "ocelot_qsys.h"
 #include "ocelot_rew.h"
 #include "ocelot_sys.h"
diff --git a/drivers/net/ethernet/mscc/ocelot_hsio.h b/drivers/net/ethernet/mscc/ocelot_hsio.h
deleted file mode 100644
index d93ddec3931b..000000000000
--- a/drivers/net/ethernet/mscc/ocelot_hsio.h
+++ /dev/null
@@ -1,785 +0,0 @@
-/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
-/*
- * Microsemi Ocelot Switch driver
- *
- * Copyright (c) 2017 Microsemi Corporation
- */
-
-#ifndef _MSCC_OCELOT_HSIO_H_
-#define _MSCC_OCELOT_HSIO_H_
-
-#define HSIO_PLL5G_CFG0_ENA_ROT                           BIT(31)
-#define HSIO_PLL5G_CFG0_ENA_LANE                          BIT(30)
-#define HSIO_PLL5G_CFG0_ENA_CLKTREE                       BIT(29)
-#define HSIO_PLL5G_CFG0_DIV4                              BIT(28)
-#define HSIO_PLL5G_CFG0_ENA_LOCK_FINE                     BIT(27)
-#define HSIO_PLL5G_CFG0_SELBGV820(x)                      (((x) << 23) & GENMASK(26, 23))
-#define HSIO_PLL5G_CFG0_SELBGV820_M                       GENMASK(26, 23)
-#define HSIO_PLL5G_CFG0_SELBGV820_X(x)                    (((x) & GENMASK(26, 23)) >> 23)
-#define HSIO_PLL5G_CFG0_LOOP_BW_RES(x)                    (((x) << 18) & GENMASK(22, 18))
-#define HSIO_PLL5G_CFG0_LOOP_BW_RES_M                     GENMASK(22, 18)
-#define HSIO_PLL5G_CFG0_LOOP_BW_RES_X(x)                  (((x) & GENMASK(22, 18)) >> 18)
-#define HSIO_PLL5G_CFG0_SELCPI(x)                         (((x) << 16) & GENMASK(17, 16))
-#define HSIO_PLL5G_CFG0_SELCPI_M                          GENMASK(17, 16)
-#define HSIO_PLL5G_CFG0_SELCPI_X(x)                       (((x) & GENMASK(17, 16)) >> 16)
-#define HSIO_PLL5G_CFG0_ENA_VCO_CONTRH                    BIT(15)
-#define HSIO_PLL5G_CFG0_ENA_CP1                           BIT(14)
-#define HSIO_PLL5G_CFG0_ENA_VCO_BUF                       BIT(13)
-#define HSIO_PLL5G_CFG0_ENA_BIAS                          BIT(12)
-#define HSIO_PLL5G_CFG0_CPU_CLK_DIV(x)                    (((x) << 6) & GENMASK(11, 6))
-#define HSIO_PLL5G_CFG0_CPU_CLK_DIV_M                     GENMASK(11, 6)
-#define HSIO_PLL5G_CFG0_CPU_CLK_DIV_X(x)                  (((x) & GENMASK(11, 6)) >> 6)
-#define HSIO_PLL5G_CFG0_CORE_CLK_DIV(x)                   ((x) & GENMASK(5, 0))
-#define HSIO_PLL5G_CFG0_CORE_CLK_DIV_M                    GENMASK(5, 0)
-
-#define HSIO_PLL5G_CFG1_ENA_DIRECT                        BIT(18)
-#define HSIO_PLL5G_CFG1_ROT_SPEED                         BIT(17)
-#define HSIO_PLL5G_CFG1_ROT_DIR                           BIT(16)
-#define HSIO_PLL5G_CFG1_READBACK_DATA_SEL                 BIT(15)
-#define HSIO_PLL5G_CFG1_RC_ENABLE                         BIT(14)
-#define HSIO_PLL5G_CFG1_RC_CTRL_DATA(x)                   (((x) << 6) & GENMASK(13, 6))
-#define HSIO_PLL5G_CFG1_RC_CTRL_DATA_M                    GENMASK(13, 6)
-#define HSIO_PLL5G_CFG1_RC_CTRL_DATA_X(x)                 (((x) & GENMASK(13, 6)) >> 6)
-#define HSIO_PLL5G_CFG1_QUARTER_RATE                      BIT(5)
-#define HSIO_PLL5G_CFG1_PWD_TX                            BIT(4)
-#define HSIO_PLL5G_CFG1_PWD_RX                            BIT(3)
-#define HSIO_PLL5G_CFG1_OUT_OF_RANGE_RECAL_ENA            BIT(2)
-#define HSIO_PLL5G_CFG1_HALF_RATE                         BIT(1)
-#define HSIO_PLL5G_CFG1_FORCE_SET_ENA                     BIT(0)
-
-#define HSIO_PLL5G_CFG2_ENA_TEST_MODE                     BIT(30)
-#define HSIO_PLL5G_CFG2_ENA_PFD_IN_FLIP                   BIT(29)
-#define HSIO_PLL5G_CFG2_ENA_VCO_NREF_TESTOUT              BIT(28)
-#define HSIO_PLL5G_CFG2_ENA_FBTESTOUT                     BIT(27)
-#define HSIO_PLL5G_CFG2_ENA_RCPLL                         BIT(26)
-#define HSIO_PLL5G_CFG2_ENA_CP2                           BIT(25)
-#define HSIO_PLL5G_CFG2_ENA_CLK_BYPASS1                   BIT(24)
-#define HSIO_PLL5G_CFG2_AMPC_SEL(x)                       (((x) << 16) & GENMASK(23, 16))
-#define HSIO_PLL5G_CFG2_AMPC_SEL_M                        GENMASK(23, 16)
-#define HSIO_PLL5G_CFG2_AMPC_SEL_X(x)                     (((x) & GENMASK(23, 16)) >> 16)
-#define HSIO_PLL5G_CFG2_ENA_CLK_BYPASS                    BIT(15)
-#define HSIO_PLL5G_CFG2_PWD_AMPCTRL_N                     BIT(14)
-#define HSIO_PLL5G_CFG2_ENA_AMPCTRL                       BIT(13)
-#define HSIO_PLL5G_CFG2_ENA_AMP_CTRL_FORCE                BIT(12)
-#define HSIO_PLL5G_CFG2_FRC_FSM_POR                       BIT(11)
-#define HSIO_PLL5G_CFG2_DISABLE_FSM_POR                   BIT(10)
-#define HSIO_PLL5G_CFG2_GAIN_TEST(x)                      (((x) << 5) & GENMASK(9, 5))
-#define HSIO_PLL5G_CFG2_GAIN_TEST_M                       GENMASK(9, 5)
-#define HSIO_PLL5G_CFG2_GAIN_TEST_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
-#define HSIO_PLL5G_CFG2_EN_RESET_OVERRUN                  BIT(4)
-#define HSIO_PLL5G_CFG2_EN_RESET_LIM_DET                  BIT(3)
-#define HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET                  BIT(2)
-#define HSIO_PLL5G_CFG2_DISABLE_FSM                       BIT(1)
-#define HSIO_PLL5G_CFG2_ENA_GAIN_TEST                     BIT(0)
-
-#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL(x)               (((x) << 22) & GENMASK(23, 22))
-#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL_M                GENMASK(23, 22)
-#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL_X(x)             (((x) & GENMASK(23, 22)) >> 22)
-#define HSIO_PLL5G_CFG3_TESTOUT_SEL(x)                    (((x) << 19) & GENMASK(21, 19))
-#define HSIO_PLL5G_CFG3_TESTOUT_SEL_M                     GENMASK(21, 19)
-#define HSIO_PLL5G_CFG3_TESTOUT_SEL_X(x)                  (((x) & GENMASK(21, 19)) >> 19)
-#define HSIO_PLL5G_CFG3_ENA_ANA_TEST_OUT                  BIT(18)
-#define HSIO_PLL5G_CFG3_ENA_TEST_OUT                      BIT(17)
-#define HSIO_PLL5G_CFG3_SEL_FBDCLK                        BIT(16)
-#define HSIO_PLL5G_CFG3_SEL_CML_CMOS_PFD                  BIT(15)
-#define HSIO_PLL5G_CFG3_RST_FB_N                          BIT(14)
-#define HSIO_PLL5G_CFG3_FORCE_VCO_CONTRH                  BIT(13)
-#define HSIO_PLL5G_CFG3_FORCE_LO                          BIT(12)
-#define HSIO_PLL5G_CFG3_FORCE_HI                          BIT(11)
-#define HSIO_PLL5G_CFG3_FORCE_ENA                         BIT(10)
-#define HSIO_PLL5G_CFG3_FORCE_CP                          BIT(9)
-#define HSIO_PLL5G_CFG3_FBDIVSEL_TST_ENA                  BIT(8)
-#define HSIO_PLL5G_CFG3_FBDIVSEL(x)                       ((x) & GENMASK(7, 0))
-#define HSIO_PLL5G_CFG3_FBDIVSEL_M                        GENMASK(7, 0)
-
-#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL(x)                   (((x) << 16) & GENMASK(23, 16))
-#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL_M                    GENMASK(23, 16)
-#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL_X(x)                 (((x) & GENMASK(23, 16)) >> 16)
-#define HSIO_PLL5G_CFG4_IB_CTRL(x)                        ((x) & GENMASK(15, 0))
-#define HSIO_PLL5G_CFG4_IB_CTRL_M                         GENMASK(15, 0)
-
-#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL(x)                   (((x) << 16) & GENMASK(23, 16))
-#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL_M                    GENMASK(23, 16)
-#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL_X(x)                 (((x) & GENMASK(23, 16)) >> 16)
-#define HSIO_PLL5G_CFG5_OB_CTRL(x)                        ((x) & GENMASK(15, 0))
-#define HSIO_PLL5G_CFG5_OB_CTRL_M                         GENMASK(15, 0)
-
-#define HSIO_PLL5G_CFG6_REFCLK_SEL_SRC                    BIT(23)
-#define HSIO_PLL5G_CFG6_REFCLK_SEL(x)                     (((x) << 20) & GENMASK(22, 20))
-#define HSIO_PLL5G_CFG6_REFCLK_SEL_M                      GENMASK(22, 20)
-#define HSIO_PLL5G_CFG6_REFCLK_SEL_X(x)                   (((x) & GENMASK(22, 20)) >> 20)
-#define HSIO_PLL5G_CFG6_REFCLK_SRC                        BIT(19)
-#define HSIO_PLL5G_CFG6_POR_DEL_SEL(x)                    (((x) << 16) & GENMASK(17, 16))
-#define HSIO_PLL5G_CFG6_POR_DEL_SEL_M                     GENMASK(17, 16)
-#define HSIO_PLL5G_CFG6_POR_DEL_SEL_X(x)                  (((x) & GENMASK(17, 16)) >> 16)
-#define HSIO_PLL5G_CFG6_DIV125REF_SEL(x)                  (((x) << 8) & GENMASK(15, 8))
-#define HSIO_PLL5G_CFG6_DIV125REF_SEL_M                   GENMASK(15, 8)
-#define HSIO_PLL5G_CFG6_DIV125REF_SEL_X(x)                (((x) & GENMASK(15, 8)) >> 8)
-#define HSIO_PLL5G_CFG6_ENA_REFCLKC2                      BIT(7)
-#define HSIO_PLL5G_CFG6_ENA_FBCLKC2                       BIT(6)
-#define HSIO_PLL5G_CFG6_DDR_CLK_DIV(x)                    ((x) & GENMASK(5, 0))
-#define HSIO_PLL5G_CFG6_DDR_CLK_DIV_M                     GENMASK(5, 0)
-
-#define HSIO_PLL5G_STATUS0_RANGE_LIM                      BIT(12)
-#define HSIO_PLL5G_STATUS0_OUT_OF_RANGE_ERR               BIT(11)
-#define HSIO_PLL5G_STATUS0_CALIBRATION_ERR                BIT(10)
-#define HSIO_PLL5G_STATUS0_CALIBRATION_DONE               BIT(9)
-#define HSIO_PLL5G_STATUS0_READBACK_DATA(x)               (((x) << 1) & GENMASK(8, 1))
-#define HSIO_PLL5G_STATUS0_READBACK_DATA_M                GENMASK(8, 1)
-#define HSIO_PLL5G_STATUS0_READBACK_DATA_X(x)             (((x) & GENMASK(8, 1)) >> 1)
-#define HSIO_PLL5G_STATUS0_LOCK_STATUS                    BIT(0)
-
-#define HSIO_PLL5G_STATUS1_SIG_DEL(x)                     (((x) << 21) & GENMASK(28, 21))
-#define HSIO_PLL5G_STATUS1_SIG_DEL_M                      GENMASK(28, 21)
-#define HSIO_PLL5G_STATUS1_SIG_DEL_X(x)                   (((x) & GENMASK(28, 21)) >> 21)
-#define HSIO_PLL5G_STATUS1_GAIN_STAT(x)                   (((x) << 16) & GENMASK(20, 16))
-#define HSIO_PLL5G_STATUS1_GAIN_STAT_M                    GENMASK(20, 16)
-#define HSIO_PLL5G_STATUS1_GAIN_STAT_X(x)                 (((x) & GENMASK(20, 16)) >> 16)
-#define HSIO_PLL5G_STATUS1_FBCNT_DIF(x)                   (((x) << 4) & GENMASK(13, 4))
-#define HSIO_PLL5G_STATUS1_FBCNT_DIF_M                    GENMASK(13, 4)
-#define HSIO_PLL5G_STATUS1_FBCNT_DIF_X(x)                 (((x) & GENMASK(13, 4)) >> 4)
-#define HSIO_PLL5G_STATUS1_FSM_STAT(x)                    (((x) << 1) & GENMASK(3, 1))
-#define HSIO_PLL5G_STATUS1_FSM_STAT_M                     GENMASK(3, 1)
-#define HSIO_PLL5G_STATUS1_FSM_STAT_X(x)                  (((x) & GENMASK(3, 1)) >> 1)
-#define HSIO_PLL5G_STATUS1_FSM_LOCK                       BIT(0)
-
-#define HSIO_PLL5G_BIST_CFG0_PLLB_START_BIST              BIT(31)
-#define HSIO_PLL5G_BIST_CFG0_PLLB_MEAS_MODE               BIT(30)
-#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT(x)          (((x) << 20) & GENMASK(23, 20))
-#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT_M           GENMASK(23, 20)
-#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT_X(x)        (((x) & GENMASK(23, 20)) >> 20)
-#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT(x)          (((x) << 16) & GENMASK(19, 16))
-#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT_M           GENMASK(19, 16)
-#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT_X(x)        (((x) & GENMASK(19, 16)) >> 16)
-#define HSIO_PLL5G_BIST_CFG0_PLLB_DIV_FACTOR_PRE(x)       ((x) & GENMASK(15, 0))
-#define HSIO_PLL5G_BIST_CFG0_PLLB_DIV_FACTOR_PRE_M        GENMASK(15, 0)
-
-#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT(x)            (((x) << 4) & GENMASK(7, 4))
-#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT_M             GENMASK(7, 4)
-#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT_X(x)          (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_PLL5G_BIST_STAT0_PLLB_BUSY                   BIT(2)
-#define HSIO_PLL5G_BIST_STAT0_PLLB_DONE_N                 BIT(1)
-#define HSIO_PLL5G_BIST_STAT0_PLLB_FAIL                   BIT(0)
-
-#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT(x)             (((x) << 16) & GENMASK(31, 16))
-#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT_M              GENMASK(31, 16)
-#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT_X(x)           (((x) & GENMASK(31, 16)) >> 16)
-#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_REF_DIFF(x)        ((x) & GENMASK(15, 0))
-#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_REF_DIFF_M         GENMASK(15, 0)
-
-#define HSIO_RCOMP_CFG0_PWD_ENA                           BIT(13)
-#define HSIO_RCOMP_CFG0_RUN_CAL                           BIT(12)
-#define HSIO_RCOMP_CFG0_SPEED_SEL(x)                      (((x) << 10) & GENMASK(11, 10))
-#define HSIO_RCOMP_CFG0_SPEED_SEL_M                       GENMASK(11, 10)
-#define HSIO_RCOMP_CFG0_SPEED_SEL_X(x)                    (((x) & GENMASK(11, 10)) >> 10)
-#define HSIO_RCOMP_CFG0_MODE_SEL(x)                       (((x) << 8) & GENMASK(9, 8))
-#define HSIO_RCOMP_CFG0_MODE_SEL_M                        GENMASK(9, 8)
-#define HSIO_RCOMP_CFG0_MODE_SEL_X(x)                     (((x) & GENMASK(9, 8)) >> 8)
-#define HSIO_RCOMP_CFG0_FORCE_ENA                         BIT(4)
-#define HSIO_RCOMP_CFG0_RCOMP_VAL(x)                      ((x) & GENMASK(3, 0))
-#define HSIO_RCOMP_CFG0_RCOMP_VAL_M                       GENMASK(3, 0)
-
-#define HSIO_RCOMP_STATUS_BUSY                            BIT(12)
-#define HSIO_RCOMP_STATUS_DELTA_ALERT                     BIT(7)
-#define HSIO_RCOMP_STATUS_RCOMP(x)                        ((x) & GENMASK(3, 0))
-#define HSIO_RCOMP_STATUS_RCOMP_M                         GENMASK(3, 0)
-
-#define HSIO_SYNC_ETH_CFG_RSZ                             0x4
-
-#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC(x)             (((x) << 4) & GENMASK(7, 4))
-#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC_M              GENMASK(7, 4)
-#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC_X(x)           (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV(x)             (((x) << 1) & GENMASK(3, 1))
-#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV_M              GENMASK(3, 1)
-#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV_X(x)           (((x) & GENMASK(3, 1)) >> 1)
-#define HSIO_SYNC_ETH_CFG_RECO_CLK_ENA                    BIT(0)
-
-#define HSIO_SYNC_ETH_PLL_CFG_PLL_AUTO_SQUELCH_ENA        BIT(0)
-
-#define HSIO_S1G_DES_CFG_DES_PHS_CTRL(x)                  (((x) << 13) & GENMASK(16, 13))
-#define HSIO_S1G_DES_CFG_DES_PHS_CTRL_M                   GENMASK(16, 13)
-#define HSIO_S1G_DES_CFG_DES_PHS_CTRL_X(x)                (((x) & GENMASK(16, 13)) >> 13)
-#define HSIO_S1G_DES_CFG_DES_CPMD_SEL(x)                  (((x) << 11) & GENMASK(12, 11))
-#define HSIO_S1G_DES_CFG_DES_CPMD_SEL_M                   GENMASK(12, 11)
-#define HSIO_S1G_DES_CFG_DES_CPMD_SEL_X(x)                (((x) & GENMASK(12, 11)) >> 11)
-#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL(x)                 (((x) << 8) & GENMASK(10, 8))
-#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL_M                  GENMASK(10, 8)
-#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL_X(x)               (((x) & GENMASK(10, 8)) >> 8)
-#define HSIO_S1G_DES_CFG_DES_BW_ANA(x)                    (((x) << 5) & GENMASK(7, 5))
-#define HSIO_S1G_DES_CFG_DES_BW_ANA_M                     GENMASK(7, 5)
-#define HSIO_S1G_DES_CFG_DES_BW_ANA_X(x)                  (((x) & GENMASK(7, 5)) >> 5)
-#define HSIO_S1G_DES_CFG_DES_SWAP_ANA                     BIT(4)
-#define HSIO_S1G_DES_CFG_DES_BW_HYST(x)                   (((x) << 1) & GENMASK(3, 1))
-#define HSIO_S1G_DES_CFG_DES_BW_HYST_M                    GENMASK(3, 1)
-#define HSIO_S1G_DES_CFG_DES_BW_HYST_X(x)                 (((x) & GENMASK(3, 1)) >> 1)
-#define HSIO_S1G_DES_CFG_DES_SWAP_HYST                    BIT(0)
-
-#define HSIO_S1G_IB_CFG_IB_FX100_ENA                      BIT(27)
-#define HSIO_S1G_IB_CFG_ACJTAG_HYST(x)                    (((x) << 24) & GENMASK(26, 24))
-#define HSIO_S1G_IB_CFG_ACJTAG_HYST_M                     GENMASK(26, 24)
-#define HSIO_S1G_IB_CFG_ACJTAG_HYST_X(x)                  (((x) & GENMASK(26, 24)) >> 24)
-#define HSIO_S1G_IB_CFG_IB_DET_LEV(x)                     (((x) << 19) & GENMASK(21, 19))
-#define HSIO_S1G_IB_CFG_IB_DET_LEV_M                      GENMASK(21, 19)
-#define HSIO_S1G_IB_CFG_IB_DET_LEV_X(x)                   (((x) & GENMASK(21, 19)) >> 19)
-#define HSIO_S1G_IB_CFG_IB_HYST_LEV                       BIT(14)
-#define HSIO_S1G_IB_CFG_IB_ENA_CMV_TERM                   BIT(13)
-#define HSIO_S1G_IB_CFG_IB_ENA_DC_COUPLING                BIT(12)
-#define HSIO_S1G_IB_CFG_IB_ENA_DETLEV                     BIT(11)
-#define HSIO_S1G_IB_CFG_IB_ENA_HYST                       BIT(10)
-#define HSIO_S1G_IB_CFG_IB_ENA_OFFSET_COMP                BIT(9)
-#define HSIO_S1G_IB_CFG_IB_EQ_GAIN(x)                     (((x) << 6) & GENMASK(8, 6))
-#define HSIO_S1G_IB_CFG_IB_EQ_GAIN_M                      GENMASK(8, 6)
-#define HSIO_S1G_IB_CFG_IB_EQ_GAIN_X(x)                   (((x) & GENMASK(8, 6)) >> 6)
-#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ(x)             (((x) << 4) & GENMASK(5, 4))
-#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ_M              GENMASK(5, 4)
-#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ_X(x)           (((x) & GENMASK(5, 4)) >> 4)
-#define HSIO_S1G_IB_CFG_IB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
-#define HSIO_S1G_IB_CFG_IB_RESISTOR_CTRL_M                GENMASK(3, 0)
-
-#define HSIO_S1G_OB_CFG_OB_SLP(x)                         (((x) << 17) & GENMASK(18, 17))
-#define HSIO_S1G_OB_CFG_OB_SLP_M                          GENMASK(18, 17)
-#define HSIO_S1G_OB_CFG_OB_SLP_X(x)                       (((x) & GENMASK(18, 17)) >> 17)
-#define HSIO_S1G_OB_CFG_OB_AMP_CTRL(x)                    (((x) << 13) & GENMASK(16, 13))
-#define HSIO_S1G_OB_CFG_OB_AMP_CTRL_M                     GENMASK(16, 13)
-#define HSIO_S1G_OB_CFG_OB_AMP_CTRL_X(x)                  (((x) & GENMASK(16, 13)) >> 13)
-#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL(x)               (((x) << 10) & GENMASK(12, 10))
-#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL_M                GENMASK(12, 10)
-#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL_X(x)             (((x) & GENMASK(12, 10)) >> 10)
-#define HSIO_S1G_OB_CFG_OB_DIS_VCM_CTRL                   BIT(9)
-#define HSIO_S1G_OB_CFG_OB_EN_MEAS_VREG                   BIT(8)
-#define HSIO_S1G_OB_CFG_OB_VCM_CTRL(x)                    (((x) << 4) & GENMASK(7, 4))
-#define HSIO_S1G_OB_CFG_OB_VCM_CTRL_M                     GENMASK(7, 4)
-#define HSIO_S1G_OB_CFG_OB_VCM_CTRL_X(x)                  (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_S1G_OB_CFG_OB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
-#define HSIO_S1G_OB_CFG_OB_RESISTOR_CTRL_M                GENMASK(3, 0)
-
-#define HSIO_S1G_SER_CFG_SER_IDLE                         BIT(9)
-#define HSIO_S1G_SER_CFG_SER_DEEMPH                       BIT(8)
-#define HSIO_S1G_SER_CFG_SER_CPMD_SEL                     BIT(7)
-#define HSIO_S1G_SER_CFG_SER_SWAP_CPMD                    BIT(6)
-#define HSIO_S1G_SER_CFG_SER_ALISEL(x)                    (((x) << 4) & GENMASK(5, 4))
-#define HSIO_S1G_SER_CFG_SER_ALISEL_M                     GENMASK(5, 4)
-#define HSIO_S1G_SER_CFG_SER_ALISEL_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
-#define HSIO_S1G_SER_CFG_SER_ENHYS                        BIT(3)
-#define HSIO_S1G_SER_CFG_SER_BIG_WIN                      BIT(2)
-#define HSIO_S1G_SER_CFG_SER_EN_WIN                       BIT(1)
-#define HSIO_S1G_SER_CFG_SER_ENALI                        BIT(0)
-
-#define HSIO_S1G_COMMON_CFG_SYS_RST                       BIT(31)
-#define HSIO_S1G_COMMON_CFG_SE_AUTO_SQUELCH_ENA           BIT(21)
-#define HSIO_S1G_COMMON_CFG_ENA_LANE                      BIT(18)
-#define HSIO_S1G_COMMON_CFG_PWD_RX                        BIT(17)
-#define HSIO_S1G_COMMON_CFG_PWD_TX                        BIT(16)
-#define HSIO_S1G_COMMON_CFG_LANE_CTRL(x)                  (((x) << 13) & GENMASK(15, 13))
-#define HSIO_S1G_COMMON_CFG_LANE_CTRL_M                   GENMASK(15, 13)
-#define HSIO_S1G_COMMON_CFG_LANE_CTRL_X(x)                (((x) & GENMASK(15, 13)) >> 13)
-#define HSIO_S1G_COMMON_CFG_ENA_DIRECT                    BIT(12)
-#define HSIO_S1G_COMMON_CFG_ENA_ELOOP                     BIT(11)
-#define HSIO_S1G_COMMON_CFG_ENA_FLOOP                     BIT(10)
-#define HSIO_S1G_COMMON_CFG_ENA_ILOOP                     BIT(9)
-#define HSIO_S1G_COMMON_CFG_ENA_PLOOP                     BIT(8)
-#define HSIO_S1G_COMMON_CFG_HRATE                         BIT(7)
-#define HSIO_S1G_COMMON_CFG_IF_MODE                       BIT(0)
-
-#define HSIO_S1G_PLL_CFG_PLL_ENA_FB_DIV2                  BIT(22)
-#define HSIO_S1G_PLL_CFG_PLL_ENA_RC_DIV2                  BIT(21)
-#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA(x)             (((x) << 8) & GENMASK(15, 8))
-#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_M              GENMASK(15, 8)
-#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_X(x)           (((x) & GENMASK(15, 8)) >> 8)
-#define HSIO_S1G_PLL_CFG_PLL_FSM_ENA                      BIT(7)
-#define HSIO_S1G_PLL_CFG_PLL_FSM_FORCE_SET_ENA            BIT(6)
-#define HSIO_S1G_PLL_CFG_PLL_FSM_OOR_RECAL_ENA            BIT(5)
-#define HSIO_S1G_PLL_CFG_PLL_RB_DATA_SEL                  BIT(3)
-
-#define HSIO_S1G_PLL_STATUS_PLL_CAL_NOT_DONE              BIT(12)
-#define HSIO_S1G_PLL_STATUS_PLL_CAL_ERR                   BIT(11)
-#define HSIO_S1G_PLL_STATUS_PLL_OUT_OF_RANGE_ERR          BIT(10)
-#define HSIO_S1G_PLL_STATUS_PLL_RB_DATA(x)                ((x) & GENMASK(7, 0))
-#define HSIO_S1G_PLL_STATUS_PLL_RB_DATA_M                 GENMASK(7, 0)
-
-#define HSIO_S1G_DFT_CFG0_LAZYBIT                         BIT(31)
-#define HSIO_S1G_DFT_CFG0_INV_DIS                         BIT(23)
-#define HSIO_S1G_DFT_CFG0_PRBS_SEL(x)                     (((x) << 20) & GENMASK(21, 20))
-#define HSIO_S1G_DFT_CFG0_PRBS_SEL_M                      GENMASK(21, 20)
-#define HSIO_S1G_DFT_CFG0_PRBS_SEL_X(x)                   (((x) & GENMASK(21, 20)) >> 20)
-#define HSIO_S1G_DFT_CFG0_TEST_MODE(x)                    (((x) << 16) & GENMASK(18, 16))
-#define HSIO_S1G_DFT_CFG0_TEST_MODE_M                     GENMASK(18, 16)
-#define HSIO_S1G_DFT_CFG0_TEST_MODE_X(x)                  (((x) & GENMASK(18, 16)) >> 16)
-#define HSIO_S1G_DFT_CFG0_RX_PHS_CORR_DIS                 BIT(4)
-#define HSIO_S1G_DFT_CFG0_RX_PDSENS_ENA                   BIT(3)
-#define HSIO_S1G_DFT_CFG0_RX_DFT_ENA                      BIT(2)
-#define HSIO_S1G_DFT_CFG0_TX_DFT_ENA                      BIT(0)
-
-#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
-#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL_M                GENMASK(17, 8)
-#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
-#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
-#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ_M                  GENMASK(7, 4)
-#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_S1G_DFT_CFG1_TX_JI_ENA                       BIT(3)
-#define HSIO_S1G_DFT_CFG1_TX_WAVEFORM_SEL                 BIT(2)
-#define HSIO_S1G_DFT_CFG1_TX_FREQOFF_DIR                  BIT(1)
-#define HSIO_S1G_DFT_CFG1_TX_FREQOFF_ENA                  BIT(0)
-
-#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
-#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL_M                GENMASK(17, 8)
-#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
-#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
-#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ_M                  GENMASK(7, 4)
-#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_S1G_DFT_CFG2_RX_JI_ENA                       BIT(3)
-#define HSIO_S1G_DFT_CFG2_RX_WAVEFORM_SEL                 BIT(2)
-#define HSIO_S1G_DFT_CFG2_RX_FREQOFF_DIR                  BIT(1)
-#define HSIO_S1G_DFT_CFG2_RX_FREQOFF_ENA                  BIT(0)
-
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_ENA             BIT(20)
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH(x)     (((x) << 16) & GENMASK(17, 16))
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_M      GENMASK(17, 16)
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_X(x)   (((x) & GENMASK(17, 16)) >> 16)
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH(x)         (((x) << 8) & GENMASK(15, 8))
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_M          GENMASK(15, 8)
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_X(x)       (((x) & GENMASK(15, 8)) >> 8)
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_LOW(x)          ((x) & GENMASK(7, 0))
-#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_LOW_M           GENMASK(7, 0)
-
-#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE(x)          (((x) << 11) & GENMASK(12, 11))
-#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE_M           GENMASK(12, 11)
-#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE_X(x)        (((x) & GENMASK(12, 11)) >> 11)
-#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_SWAP             BIT(10)
-#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_MODE             BIT(9)
-#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_ENA              BIT(8)
-#define HSIO_S1G_MISC_CFG_RX_LPI_MODE_ENA                 BIT(5)
-#define HSIO_S1G_MISC_CFG_TX_LPI_MODE_ENA                 BIT(4)
-#define HSIO_S1G_MISC_CFG_RX_DATA_INV_ENA                 BIT(3)
-#define HSIO_S1G_MISC_CFG_TX_DATA_INV_ENA                 BIT(2)
-#define HSIO_S1G_MISC_CFG_LANE_RST                        BIT(0)
-
-#define HSIO_S1G_DFT_STATUS_PLL_BIST_NOT_DONE             BIT(7)
-#define HSIO_S1G_DFT_STATUS_PLL_BIST_FAILED               BIT(6)
-#define HSIO_S1G_DFT_STATUS_PLL_BIST_TIMEOUT_ERR          BIT(5)
-#define HSIO_S1G_DFT_STATUS_BIST_ACTIVE                   BIT(3)
-#define HSIO_S1G_DFT_STATUS_BIST_NOSYNC                   BIT(2)
-#define HSIO_S1G_DFT_STATUS_BIST_COMPLETE_N               BIT(1)
-#define HSIO_S1G_DFT_STATUS_BIST_ERROR                    BIT(0)
-
-#define HSIO_S1G_MISC_STATUS_DES_100FX_PHASE_SEL          BIT(0)
-
-#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_WR_ONE_SHOT        BIT(31)
-#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_RD_ONE_SHOT        BIT(30)
-#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR(x)            ((x) & GENMASK(8, 0))
-#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR_M             GENMASK(8, 0)
-
-#define HSIO_S6G_DIG_CFG_GP(x)                            (((x) << 16) & GENMASK(18, 16))
-#define HSIO_S6G_DIG_CFG_GP_M                             GENMASK(18, 16)
-#define HSIO_S6G_DIG_CFG_GP_X(x)                          (((x) & GENMASK(18, 16)) >> 16)
-#define HSIO_S6G_DIG_CFG_TX_BIT_DOUBLING_MODE_ENA         BIT(7)
-#define HSIO_S6G_DIG_CFG_SIGDET_TESTMODE                  BIT(6)
-#define HSIO_S6G_DIG_CFG_SIGDET_AST(x)                    (((x) << 3) & GENMASK(5, 3))
-#define HSIO_S6G_DIG_CFG_SIGDET_AST_M                     GENMASK(5, 3)
-#define HSIO_S6G_DIG_CFG_SIGDET_AST_X(x)                  (((x) & GENMASK(5, 3)) >> 3)
-#define HSIO_S6G_DIG_CFG_SIGDET_DST(x)                    ((x) & GENMASK(2, 0))
-#define HSIO_S6G_DIG_CFG_SIGDET_DST_M                     GENMASK(2, 0)
-
-#define HSIO_S6G_DFT_CFG0_LAZYBIT                         BIT(31)
-#define HSIO_S6G_DFT_CFG0_INV_DIS                         BIT(23)
-#define HSIO_S6G_DFT_CFG0_PRBS_SEL(x)                     (((x) << 20) & GENMASK(21, 20))
-#define HSIO_S6G_DFT_CFG0_PRBS_SEL_M                      GENMASK(21, 20)
-#define HSIO_S6G_DFT_CFG0_PRBS_SEL_X(x)                   (((x) & GENMASK(21, 20)) >> 20)
-#define HSIO_S6G_DFT_CFG0_TEST_MODE(x)                    (((x) << 16) & GENMASK(18, 16))
-#define HSIO_S6G_DFT_CFG0_TEST_MODE_M                     GENMASK(18, 16)
-#define HSIO_S6G_DFT_CFG0_TEST_MODE_X(x)                  (((x) & GENMASK(18, 16)) >> 16)
-#define HSIO_S6G_DFT_CFG0_RX_PHS_CORR_DIS                 BIT(4)
-#define HSIO_S6G_DFT_CFG0_RX_PDSENS_ENA                   BIT(3)
-#define HSIO_S6G_DFT_CFG0_RX_DFT_ENA                      BIT(2)
-#define HSIO_S6G_DFT_CFG0_TX_DFT_ENA                      BIT(0)
-
-#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
-#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL_M                GENMASK(17, 8)
-#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
-#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
-#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ_M                  GENMASK(7, 4)
-#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_S6G_DFT_CFG1_TX_JI_ENA                       BIT(3)
-#define HSIO_S6G_DFT_CFG1_TX_WAVEFORM_SEL                 BIT(2)
-#define HSIO_S6G_DFT_CFG1_TX_FREQOFF_DIR                  BIT(1)
-#define HSIO_S6G_DFT_CFG1_TX_FREQOFF_ENA                  BIT(0)
-
-#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
-#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL_M                GENMASK(17, 8)
-#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
-#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
-#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ_M                  GENMASK(7, 4)
-#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_S6G_DFT_CFG2_RX_JI_ENA                       BIT(3)
-#define HSIO_S6G_DFT_CFG2_RX_WAVEFORM_SEL                 BIT(2)
-#define HSIO_S6G_DFT_CFG2_RX_FREQOFF_DIR                  BIT(1)
-#define HSIO_S6G_DFT_CFG2_RX_FREQOFF_ENA                  BIT(0)
-
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_ENA             BIT(20)
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH(x)     (((x) << 16) & GENMASK(19, 16))
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_M      GENMASK(19, 16)
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_X(x)   (((x) & GENMASK(19, 16)) >> 16)
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH(x)         (((x) << 8) & GENMASK(15, 8))
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_M          GENMASK(15, 8)
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_X(x)       (((x) & GENMASK(15, 8)) >> 8)
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_LOW(x)          ((x) & GENMASK(7, 0))
-#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_LOW_M           GENMASK(7, 0)
-
-#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK(x)                 (((x) << 13) & GENMASK(14, 13))
-#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK_M                  GENMASK(14, 13)
-#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK_X(x)               (((x) & GENMASK(14, 13)) >> 13)
-#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE(x)          (((x) << 11) & GENMASK(12, 11))
-#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE_M           GENMASK(12, 11)
-#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE_X(x)        (((x) & GENMASK(12, 11)) >> 11)
-#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_SWAP             BIT(10)
-#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_MODE             BIT(9)
-#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_ENA              BIT(8)
-#define HSIO_S6G_MISC_CFG_RX_BUS_FLIP_ENA                 BIT(7)
-#define HSIO_S6G_MISC_CFG_TX_BUS_FLIP_ENA                 BIT(6)
-#define HSIO_S6G_MISC_CFG_RX_LPI_MODE_ENA                 BIT(5)
-#define HSIO_S6G_MISC_CFG_TX_LPI_MODE_ENA                 BIT(4)
-#define HSIO_S6G_MISC_CFG_RX_DATA_INV_ENA                 BIT(3)
-#define HSIO_S6G_MISC_CFG_TX_DATA_INV_ENA                 BIT(2)
-#define HSIO_S6G_MISC_CFG_LANE_RST                        BIT(0)
-
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0(x)               (((x) << 23) & GENMASK(28, 23))
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0_M                GENMASK(28, 23)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0_X(x)             (((x) & GENMASK(28, 23)) >> 23)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1(x)               (((x) << 18) & GENMASK(22, 18))
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1_M                GENMASK(22, 18)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1_X(x)             (((x) & GENMASK(22, 18)) >> 18)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC(x)                (((x) << 13) & GENMASK(17, 13))
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC_M                 GENMASK(17, 13)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC_X(x)              (((x) & GENMASK(17, 13)) >> 13)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS(x)             (((x) << 6) & GENMASK(8, 6))
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS_M              GENMASK(8, 6)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS_X(x)           (((x) & GENMASK(8, 6)) >> 6)
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_LEV(x)                 ((x) & GENMASK(5, 0))
-#define HSIO_S6G_OB_ANEG_CFG_AN_OB_LEV_M                  GENMASK(5, 0)
-
-#define HSIO_S6G_DFT_STATUS_PRBS_SYNC_STAT                BIT(8)
-#define HSIO_S6G_DFT_STATUS_PLL_BIST_NOT_DONE             BIT(7)
-#define HSIO_S6G_DFT_STATUS_PLL_BIST_FAILED               BIT(6)
-#define HSIO_S6G_DFT_STATUS_PLL_BIST_TIMEOUT_ERR          BIT(5)
-#define HSIO_S6G_DFT_STATUS_BIST_ACTIVE                   BIT(3)
-#define HSIO_S6G_DFT_STATUS_BIST_NOSYNC                   BIT(2)
-#define HSIO_S6G_DFT_STATUS_BIST_COMPLETE_N               BIT(1)
-#define HSIO_S6G_DFT_STATUS_BIST_ERROR                    BIT(0)
-
-#define HSIO_S6G_MISC_STATUS_DES_100FX_PHASE_SEL          BIT(0)
-
-#define HSIO_S6G_DES_CFG_DES_PHS_CTRL(x)                  (((x) << 13) & GENMASK(16, 13))
-#define HSIO_S6G_DES_CFG_DES_PHS_CTRL_M                   GENMASK(16, 13)
-#define HSIO_S6G_DES_CFG_DES_PHS_CTRL_X(x)                (((x) & GENMASK(16, 13)) >> 13)
-#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL(x)                 (((x) << 10) & GENMASK(12, 10))
-#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL_M                  GENMASK(12, 10)
-#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL_X(x)               (((x) & GENMASK(12, 10)) >> 10)
-#define HSIO_S6G_DES_CFG_DES_CPMD_SEL(x)                  (((x) << 8) & GENMASK(9, 8))
-#define HSIO_S6G_DES_CFG_DES_CPMD_SEL_M                   GENMASK(9, 8)
-#define HSIO_S6G_DES_CFG_DES_CPMD_SEL_X(x)                (((x) & GENMASK(9, 8)) >> 8)
-#define HSIO_S6G_DES_CFG_DES_BW_HYST(x)                   (((x) << 5) & GENMASK(7, 5))
-#define HSIO_S6G_DES_CFG_DES_BW_HYST_M                    GENMASK(7, 5)
-#define HSIO_S6G_DES_CFG_DES_BW_HYST_X(x)                 (((x) & GENMASK(7, 5)) >> 5)
-#define HSIO_S6G_DES_CFG_DES_SWAP_HYST                    BIT(4)
-#define HSIO_S6G_DES_CFG_DES_BW_ANA(x)                    (((x) << 1) & GENMASK(3, 1))
-#define HSIO_S6G_DES_CFG_DES_BW_ANA_M                     GENMASK(3, 1)
-#define HSIO_S6G_DES_CFG_DES_BW_ANA_X(x)                  (((x) & GENMASK(3, 1)) >> 1)
-#define HSIO_S6G_DES_CFG_DES_SWAP_ANA                     BIT(0)
-
-#define HSIO_S6G_IB_CFG_IB_SOFSI(x)                       (((x) << 29) & GENMASK(30, 29))
-#define HSIO_S6G_IB_CFG_IB_SOFSI_M                        GENMASK(30, 29)
-#define HSIO_S6G_IB_CFG_IB_SOFSI_X(x)                     (((x) & GENMASK(30, 29)) >> 29)
-#define HSIO_S6G_IB_CFG_IB_VBULK_SEL                      BIT(28)
-#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ(x)                    (((x) << 24) & GENMASK(27, 24))
-#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ_M                     GENMASK(27, 24)
-#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ_X(x)                  (((x) & GENMASK(27, 24)) >> 24)
-#define HSIO_S6G_IB_CFG_IB_ICML_ADJ(x)                    (((x) << 20) & GENMASK(23, 20))
-#define HSIO_S6G_IB_CFG_IB_ICML_ADJ_M                     GENMASK(23, 20)
-#define HSIO_S6G_IB_CFG_IB_ICML_ADJ_X(x)                  (((x) & GENMASK(23, 20)) >> 20)
-#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL(x)               (((x) << 18) & GENMASK(19, 18))
-#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL_M                GENMASK(19, 18)
-#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL_X(x)             (((x) & GENMASK(19, 18)) >> 18)
-#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL(x)             (((x) << 15) & GENMASK(17, 15))
-#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL_M              GENMASK(17, 15)
-#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL_X(x)           (((x) & GENMASK(17, 15)) >> 15)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP(x)              (((x) << 13) & GENMASK(14, 13))
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP_M               GENMASK(14, 13)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP_X(x)            (((x) & GENMASK(14, 13)) >> 13)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID(x)             (((x) << 11) & GENMASK(12, 11))
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID_M              GENMASK(12, 11)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID_X(x)           (((x) & GENMASK(12, 11)) >> 11)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP(x)              (((x) << 9) & GENMASK(10, 9))
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP_M               GENMASK(10, 9)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP_X(x)            (((x) & GENMASK(10, 9)) >> 9)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET(x)          (((x) << 7) & GENMASK(8, 7))
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET_M           GENMASK(8, 7)
-#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET_X(x)        (((x) & GENMASK(8, 7)) >> 7)
-#define HSIO_S6G_IB_CFG_IB_ANA_TEST_ENA                   BIT(6)
-#define HSIO_S6G_IB_CFG_IB_SIG_DET_ENA                    BIT(5)
-#define HSIO_S6G_IB_CFG_IB_CONCUR                         BIT(4)
-#define HSIO_S6G_IB_CFG_IB_CAL_ENA                        BIT(3)
-#define HSIO_S6G_IB_CFG_IB_SAM_ENA                        BIT(2)
-#define HSIO_S6G_IB_CFG_IB_EQZ_ENA                        BIT(1)
-#define HSIO_S6G_IB_CFG_IB_REG_ENA                        BIT(0)
-
-#define HSIO_S6G_IB_CFG1_IB_TJTAG(x)                      (((x) << 17) & GENMASK(21, 17))
-#define HSIO_S6G_IB_CFG1_IB_TJTAG_M                       GENMASK(21, 17)
-#define HSIO_S6G_IB_CFG1_IB_TJTAG_X(x)                    (((x) & GENMASK(21, 17)) >> 17)
-#define HSIO_S6G_IB_CFG1_IB_TSDET(x)                      (((x) << 12) & GENMASK(16, 12))
-#define HSIO_S6G_IB_CFG1_IB_TSDET_M                       GENMASK(16, 12)
-#define HSIO_S6G_IB_CFG1_IB_TSDET_X(x)                    (((x) & GENMASK(16, 12)) >> 12)
-#define HSIO_S6G_IB_CFG1_IB_SCALY(x)                      (((x) << 8) & GENMASK(11, 8))
-#define HSIO_S6G_IB_CFG1_IB_SCALY_M                       GENMASK(11, 8)
-#define HSIO_S6G_IB_CFG1_IB_SCALY_X(x)                    (((x) & GENMASK(11, 8)) >> 8)
-#define HSIO_S6G_IB_CFG1_IB_FILT_HP                       BIT(7)
-#define HSIO_S6G_IB_CFG1_IB_FILT_MID                      BIT(6)
-#define HSIO_S6G_IB_CFG1_IB_FILT_LP                       BIT(5)
-#define HSIO_S6G_IB_CFG1_IB_FILT_OFFSET                   BIT(4)
-#define HSIO_S6G_IB_CFG1_IB_FRC_HP                        BIT(3)
-#define HSIO_S6G_IB_CFG1_IB_FRC_MID                       BIT(2)
-#define HSIO_S6G_IB_CFG1_IB_FRC_LP                        BIT(1)
-#define HSIO_S6G_IB_CFG1_IB_FRC_OFFSET                    BIT(0)
-
-#define HSIO_S6G_IB_CFG2_IB_TINFV(x)                      (((x) << 27) & GENMASK(29, 27))
-#define HSIO_S6G_IB_CFG2_IB_TINFV_M                       GENMASK(29, 27)
-#define HSIO_S6G_IB_CFG2_IB_TINFV_X(x)                    (((x) & GENMASK(29, 27)) >> 27)
-#define HSIO_S6G_IB_CFG2_IB_OINFI(x)                      (((x) << 22) & GENMASK(26, 22))
-#define HSIO_S6G_IB_CFG2_IB_OINFI_M                       GENMASK(26, 22)
-#define HSIO_S6G_IB_CFG2_IB_OINFI_X(x)                    (((x) & GENMASK(26, 22)) >> 22)
-#define HSIO_S6G_IB_CFG2_IB_TAUX(x)                       (((x) << 19) & GENMASK(21, 19))
-#define HSIO_S6G_IB_CFG2_IB_TAUX_M                        GENMASK(21, 19)
-#define HSIO_S6G_IB_CFG2_IB_TAUX_X(x)                     (((x) & GENMASK(21, 19)) >> 19)
-#define HSIO_S6G_IB_CFG2_IB_OINFS(x)                      (((x) << 16) & GENMASK(18, 16))
-#define HSIO_S6G_IB_CFG2_IB_OINFS_M                       GENMASK(18, 16)
-#define HSIO_S6G_IB_CFG2_IB_OINFS_X(x)                    (((x) & GENMASK(18, 16)) >> 16)
-#define HSIO_S6G_IB_CFG2_IB_OCALS(x)                      (((x) << 10) & GENMASK(15, 10))
-#define HSIO_S6G_IB_CFG2_IB_OCALS_M                       GENMASK(15, 10)
-#define HSIO_S6G_IB_CFG2_IB_OCALS_X(x)                    (((x) & GENMASK(15, 10)) >> 10)
-#define HSIO_S6G_IB_CFG2_IB_TCALV(x)                      (((x) << 5) & GENMASK(9, 5))
-#define HSIO_S6G_IB_CFG2_IB_TCALV_M                       GENMASK(9, 5)
-#define HSIO_S6G_IB_CFG2_IB_TCALV_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
-#define HSIO_S6G_IB_CFG2_IB_UMAX(x)                       (((x) << 3) & GENMASK(4, 3))
-#define HSIO_S6G_IB_CFG2_IB_UMAX_M                        GENMASK(4, 3)
-#define HSIO_S6G_IB_CFG2_IB_UMAX_X(x)                     (((x) & GENMASK(4, 3)) >> 3)
-#define HSIO_S6G_IB_CFG2_IB_UREG(x)                       ((x) & GENMASK(2, 0))
-#define HSIO_S6G_IB_CFG2_IB_UREG_M                        GENMASK(2, 0)
-
-#define HSIO_S6G_IB_CFG3_IB_INI_HP(x)                     (((x) << 18) & GENMASK(23, 18))
-#define HSIO_S6G_IB_CFG3_IB_INI_HP_M                      GENMASK(23, 18)
-#define HSIO_S6G_IB_CFG3_IB_INI_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
-#define HSIO_S6G_IB_CFG3_IB_INI_MID(x)                    (((x) << 12) & GENMASK(17, 12))
-#define HSIO_S6G_IB_CFG3_IB_INI_MID_M                     GENMASK(17, 12)
-#define HSIO_S6G_IB_CFG3_IB_INI_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
-#define HSIO_S6G_IB_CFG3_IB_INI_LP(x)                     (((x) << 6) & GENMASK(11, 6))
-#define HSIO_S6G_IB_CFG3_IB_INI_LP_M                      GENMASK(11, 6)
-#define HSIO_S6G_IB_CFG3_IB_INI_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
-#define HSIO_S6G_IB_CFG3_IB_INI_OFFSET(x)                 ((x) & GENMASK(5, 0))
-#define HSIO_S6G_IB_CFG3_IB_INI_OFFSET_M                  GENMASK(5, 0)
-
-#define HSIO_S6G_IB_CFG4_IB_MAX_HP(x)                     (((x) << 18) & GENMASK(23, 18))
-#define HSIO_S6G_IB_CFG4_IB_MAX_HP_M                      GENMASK(23, 18)
-#define HSIO_S6G_IB_CFG4_IB_MAX_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
-#define HSIO_S6G_IB_CFG4_IB_MAX_MID(x)                    (((x) << 12) & GENMASK(17, 12))
-#define HSIO_S6G_IB_CFG4_IB_MAX_MID_M                     GENMASK(17, 12)
-#define HSIO_S6G_IB_CFG4_IB_MAX_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
-#define HSIO_S6G_IB_CFG4_IB_MAX_LP(x)                     (((x) << 6) & GENMASK(11, 6))
-#define HSIO_S6G_IB_CFG4_IB_MAX_LP_M                      GENMASK(11, 6)
-#define HSIO_S6G_IB_CFG4_IB_MAX_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
-#define HSIO_S6G_IB_CFG4_IB_MAX_OFFSET(x)                 ((x) & GENMASK(5, 0))
-#define HSIO_S6G_IB_CFG4_IB_MAX_OFFSET_M                  GENMASK(5, 0)
-
-#define HSIO_S6G_IB_CFG5_IB_MIN_HP(x)                     (((x) << 18) & GENMASK(23, 18))
-#define HSIO_S6G_IB_CFG5_IB_MIN_HP_M                      GENMASK(23, 18)
-#define HSIO_S6G_IB_CFG5_IB_MIN_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
-#define HSIO_S6G_IB_CFG5_IB_MIN_MID(x)                    (((x) << 12) & GENMASK(17, 12))
-#define HSIO_S6G_IB_CFG5_IB_MIN_MID_M                     GENMASK(17, 12)
-#define HSIO_S6G_IB_CFG5_IB_MIN_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
-#define HSIO_S6G_IB_CFG5_IB_MIN_LP(x)                     (((x) << 6) & GENMASK(11, 6))
-#define HSIO_S6G_IB_CFG5_IB_MIN_LP_M                      GENMASK(11, 6)
-#define HSIO_S6G_IB_CFG5_IB_MIN_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
-#define HSIO_S6G_IB_CFG5_IB_MIN_OFFSET(x)                 ((x) & GENMASK(5, 0))
-#define HSIO_S6G_IB_CFG5_IB_MIN_OFFSET_M                  GENMASK(5, 0)
-
-#define HSIO_S6G_OB_CFG_OB_IDLE                           BIT(31)
-#define HSIO_S6G_OB_CFG_OB_ENA1V_MODE                     BIT(30)
-#define HSIO_S6G_OB_CFG_OB_POL                            BIT(29)
-#define HSIO_S6G_OB_CFG_OB_POST0(x)                       (((x) << 23) & GENMASK(28, 23))
-#define HSIO_S6G_OB_CFG_OB_POST0_M                        GENMASK(28, 23)
-#define HSIO_S6G_OB_CFG_OB_POST0_X(x)                     (((x) & GENMASK(28, 23)) >> 23)
-#define HSIO_S6G_OB_CFG_OB_PREC(x)                        (((x) << 18) & GENMASK(22, 18))
-#define HSIO_S6G_OB_CFG_OB_PREC_M                         GENMASK(22, 18)
-#define HSIO_S6G_OB_CFG_OB_PREC_X(x)                      (((x) & GENMASK(22, 18)) >> 18)
-#define HSIO_S6G_OB_CFG_OB_R_ADJ_MUX                      BIT(17)
-#define HSIO_S6G_OB_CFG_OB_R_ADJ_PDR                      BIT(16)
-#define HSIO_S6G_OB_CFG_OB_POST1(x)                       (((x) << 11) & GENMASK(15, 11))
-#define HSIO_S6G_OB_CFG_OB_POST1_M                        GENMASK(15, 11)
-#define HSIO_S6G_OB_CFG_OB_POST1_X(x)                     (((x) & GENMASK(15, 11)) >> 11)
-#define HSIO_S6G_OB_CFG_OB_R_COR                          BIT(10)
-#define HSIO_S6G_OB_CFG_OB_SEL_RCTRL                      BIT(9)
-#define HSIO_S6G_OB_CFG_OB_SR_H                           BIT(8)
-#define HSIO_S6G_OB_CFG_OB_SR(x)                          (((x) << 4) & GENMASK(7, 4))
-#define HSIO_S6G_OB_CFG_OB_SR_M                           GENMASK(7, 4)
-#define HSIO_S6G_OB_CFG_OB_SR_X(x)                        (((x) & GENMASK(7, 4)) >> 4)
-#define HSIO_S6G_OB_CFG_OB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
-#define HSIO_S6G_OB_CFG_OB_RESISTOR_CTRL_M                GENMASK(3, 0)
-
-#define HSIO_S6G_OB_CFG1_OB_ENA_CAS(x)                    (((x) << 6) & GENMASK(8, 6))
-#define HSIO_S6G_OB_CFG1_OB_ENA_CAS_M                     GENMASK(8, 6)
-#define HSIO_S6G_OB_CFG1_OB_ENA_CAS_X(x)                  (((x) & GENMASK(8, 6)) >> 6)
-#define HSIO_S6G_OB_CFG1_OB_LEV(x)                        ((x) & GENMASK(5, 0))
-#define HSIO_S6G_OB_CFG1_OB_LEV_M                         GENMASK(5, 0)
-
-#define HSIO_S6G_SER_CFG_SER_4TAP_ENA                     BIT(8)
-#define HSIO_S6G_SER_CFG_SER_CPMD_SEL                     BIT(7)
-#define HSIO_S6G_SER_CFG_SER_SWAP_CPMD                    BIT(6)
-#define HSIO_S6G_SER_CFG_SER_ALISEL(x)                    (((x) << 4) & GENMASK(5, 4))
-#define HSIO_S6G_SER_CFG_SER_ALISEL_M                     GENMASK(5, 4)
-#define HSIO_S6G_SER_CFG_SER_ALISEL_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
-#define HSIO_S6G_SER_CFG_SER_ENHYS                        BIT(3)
-#define HSIO_S6G_SER_CFG_SER_BIG_WIN                      BIT(2)
-#define HSIO_S6G_SER_CFG_SER_EN_WIN                       BIT(1)
-#define HSIO_S6G_SER_CFG_SER_ENALI                        BIT(0)
-
-#define HSIO_S6G_COMMON_CFG_SYS_RST                       BIT(17)
-#define HSIO_S6G_COMMON_CFG_SE_DIV2_ENA                   BIT(16)
-#define HSIO_S6G_COMMON_CFG_SE_AUTO_SQUELCH_ENA           BIT(15)
-#define HSIO_S6G_COMMON_CFG_ENA_LANE                      BIT(14)
-#define HSIO_S6G_COMMON_CFG_PWD_RX                        BIT(13)
-#define HSIO_S6G_COMMON_CFG_PWD_TX                        BIT(12)
-#define HSIO_S6G_COMMON_CFG_LANE_CTRL(x)                  (((x) << 9) & GENMASK(11, 9))
-#define HSIO_S6G_COMMON_CFG_LANE_CTRL_M                   GENMASK(11, 9)
-#define HSIO_S6G_COMMON_CFG_LANE_CTRL_X(x)                (((x) & GENMASK(11, 9)) >> 9)
-#define HSIO_S6G_COMMON_CFG_ENA_DIRECT                    BIT(8)
-#define HSIO_S6G_COMMON_CFG_ENA_ELOOP                     BIT(7)
-#define HSIO_S6G_COMMON_CFG_ENA_FLOOP                     BIT(6)
-#define HSIO_S6G_COMMON_CFG_ENA_ILOOP                     BIT(5)
-#define HSIO_S6G_COMMON_CFG_ENA_PLOOP                     BIT(4)
-#define HSIO_S6G_COMMON_CFG_HRATE                         BIT(3)
-#define HSIO_S6G_COMMON_CFG_QRATE                         BIT(2)
-#define HSIO_S6G_COMMON_CFG_IF_MODE(x)                    ((x) & GENMASK(1, 0))
-#define HSIO_S6G_COMMON_CFG_IF_MODE_M                     GENMASK(1, 0)
-
-#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS(x)                  (((x) << 16) & GENMASK(17, 16))
-#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS_M                   GENMASK(17, 16)
-#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS_X(x)                (((x) & GENMASK(17, 16)) >> 16)
-#define HSIO_S6G_PLL_CFG_PLL_DIV4                         BIT(15)
-#define HSIO_S6G_PLL_CFG_PLL_ENA_ROT                      BIT(14)
-#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA(x)             (((x) << 6) & GENMASK(13, 6))
-#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA_M              GENMASK(13, 6)
-#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA_X(x)           (((x) & GENMASK(13, 6)) >> 6)
-#define HSIO_S6G_PLL_CFG_PLL_FSM_ENA                      BIT(5)
-#define HSIO_S6G_PLL_CFG_PLL_FSM_FORCE_SET_ENA            BIT(4)
-#define HSIO_S6G_PLL_CFG_PLL_FSM_OOR_RECAL_ENA            BIT(3)
-#define HSIO_S6G_PLL_CFG_PLL_RB_DATA_SEL                  BIT(2)
-#define HSIO_S6G_PLL_CFG_PLL_ROT_DIR                      BIT(1)
-#define HSIO_S6G_PLL_CFG_PLL_ROT_FRQ                      BIT(0)
-
-#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_DATA_N            BIT(5)
-#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_DATA_P            BIT(4)
-#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_CLK               BIT(3)
-#define HSIO_S6G_ACJTAG_CFG_OB_DIRECT                     BIT(2)
-#define HSIO_S6G_ACJTAG_CFG_ACJTAG_ENA                    BIT(1)
-#define HSIO_S6G_ACJTAG_CFG_JTAG_CTRL_ENA                 BIT(0)
-
-#define HSIO_S6G_GP_CFG_GP_MSB(x)                         (((x) << 16) & GENMASK(31, 16))
-#define HSIO_S6G_GP_CFG_GP_MSB_M                          GENMASK(31, 16)
-#define HSIO_S6G_GP_CFG_GP_MSB_X(x)                       (((x) & GENMASK(31, 16)) >> 16)
-#define HSIO_S6G_GP_CFG_GP_LSB(x)                         ((x) & GENMASK(15, 0))
-#define HSIO_S6G_GP_CFG_GP_LSB_M                          GENMASK(15, 0)
-
-#define HSIO_S6G_IB_STATUS0_IB_CAL_DONE                   BIT(8)
-#define HSIO_S6G_IB_STATUS0_IB_HP_GAIN_ACT                BIT(7)
-#define HSIO_S6G_IB_STATUS0_IB_MID_GAIN_ACT               BIT(6)
-#define HSIO_S6G_IB_STATUS0_IB_LP_GAIN_ACT                BIT(5)
-#define HSIO_S6G_IB_STATUS0_IB_OFFSET_ACT                 BIT(4)
-#define HSIO_S6G_IB_STATUS0_IB_OFFSET_VLD                 BIT(3)
-#define HSIO_S6G_IB_STATUS0_IB_OFFSET_ERR                 BIT(2)
-#define HSIO_S6G_IB_STATUS0_IB_OFFSDIR                    BIT(1)
-#define HSIO_S6G_IB_STATUS0_IB_SIG_DET                    BIT(0)
-
-#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT(x)            (((x) << 18) & GENMASK(23, 18))
-#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT_M             GENMASK(23, 18)
-#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT_X(x)          (((x) & GENMASK(23, 18)) >> 18)
-#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT(x)           (((x) << 12) & GENMASK(17, 12))
-#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT_M            GENMASK(17, 12)
-#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT_X(x)         (((x) & GENMASK(17, 12)) >> 12)
-#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT(x)            (((x) << 6) & GENMASK(11, 6))
-#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT_M             GENMASK(11, 6)
-#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT_X(x)          (((x) & GENMASK(11, 6)) >> 6)
-#define HSIO_S6G_IB_STATUS1_IB_OFFSET_STAT(x)             ((x) & GENMASK(5, 0))
-#define HSIO_S6G_IB_STATUS1_IB_OFFSET_STAT_M              GENMASK(5, 0)
-
-#define HSIO_S6G_ACJTAG_STATUS_ACJTAG_CAPT_DATA_N         BIT(2)
-#define HSIO_S6G_ACJTAG_STATUS_ACJTAG_CAPT_DATA_P         BIT(1)
-#define HSIO_S6G_ACJTAG_STATUS_IB_DIRECT                  BIT(0)
-
-#define HSIO_S6G_PLL_STATUS_PLL_CAL_NOT_DONE              BIT(10)
-#define HSIO_S6G_PLL_STATUS_PLL_CAL_ERR                   BIT(9)
-#define HSIO_S6G_PLL_STATUS_PLL_OUT_OF_RANGE_ERR          BIT(8)
-#define HSIO_S6G_PLL_STATUS_PLL_RB_DATA(x)                ((x) & GENMASK(7, 0))
-#define HSIO_S6G_PLL_STATUS_PLL_RB_DATA_M                 GENMASK(7, 0)
-
-#define HSIO_S6G_REVID_SERDES_REV(x)                      (((x) << 26) & GENMASK(31, 26))
-#define HSIO_S6G_REVID_SERDES_REV_M                       GENMASK(31, 26)
-#define HSIO_S6G_REVID_SERDES_REV_X(x)                    (((x) & GENMASK(31, 26)) >> 26)
-#define HSIO_S6G_REVID_RCPLL_REV(x)                       (((x) << 21) & GENMASK(25, 21))
-#define HSIO_S6G_REVID_RCPLL_REV_M                        GENMASK(25, 21)
-#define HSIO_S6G_REVID_RCPLL_REV_X(x)                     (((x) & GENMASK(25, 21)) >> 21)
-#define HSIO_S6G_REVID_SER_REV(x)                         (((x) << 16) & GENMASK(20, 16))
-#define HSIO_S6G_REVID_SER_REV_M                          GENMASK(20, 16)
-#define HSIO_S6G_REVID_SER_REV_X(x)                       (((x) & GENMASK(20, 16)) >> 16)
-#define HSIO_S6G_REVID_DES_REV(x)                         (((x) << 10) & GENMASK(15, 10))
-#define HSIO_S6G_REVID_DES_REV_M                          GENMASK(15, 10)
-#define HSIO_S6G_REVID_DES_REV_X(x)                       (((x) & GENMASK(15, 10)) >> 10)
-#define HSIO_S6G_REVID_OB_REV(x)                          (((x) << 5) & GENMASK(9, 5))
-#define HSIO_S6G_REVID_OB_REV_M                           GENMASK(9, 5)
-#define HSIO_S6G_REVID_OB_REV_X(x)                        (((x) & GENMASK(9, 5)) >> 5)
-#define HSIO_S6G_REVID_IB_REV(x)                          ((x) & GENMASK(4, 0))
-#define HSIO_S6G_REVID_IB_REV_M                           GENMASK(4, 0)
-
-#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_WR_ONE_SHOT        BIT(31)
-#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_RD_ONE_SHOT        BIT(30)
-#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_ADDR(x)            ((x) & GENMASK(24, 0))
-#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_ADDR_M             GENMASK(24, 0)
-
-#define HSIO_HW_CFG_DEV2G5_10_MODE                        BIT(6)
-#define HSIO_HW_CFG_DEV1G_9_MODE                          BIT(5)
-#define HSIO_HW_CFG_DEV1G_6_MODE                          BIT(4)
-#define HSIO_HW_CFG_DEV1G_5_MODE                          BIT(3)
-#define HSIO_HW_CFG_DEV1G_4_MODE                          BIT(2)
-#define HSIO_HW_CFG_PCIE_ENA                              BIT(1)
-#define HSIO_HW_CFG_QSGMII_ENA                            BIT(0)
-
-#define HSIO_HW_QSGMII_CFG_SHYST_DIS                      BIT(3)
-#define HSIO_HW_QSGMII_CFG_E_DET_ENA                      BIT(2)
-#define HSIO_HW_QSGMII_CFG_USE_I1_ENA                     BIT(1)
-#define HSIO_HW_QSGMII_CFG_FLIP_LANES                     BIT(0)
-
-#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS(x)           (((x) << 1) & GENMASK(6, 1))
-#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS_M            GENMASK(6, 1)
-#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS_X(x)         (((x) & GENMASK(6, 1)) >> 1)
-#define HSIO_HW_QSGMII_STAT_SYNC                          BIT(0)
-
-#define HSIO_CLK_CFG_CLKDIV_PHY(x)                        (((x) << 1) & GENMASK(8, 1))
-#define HSIO_CLK_CFG_CLKDIV_PHY_M                         GENMASK(8, 1)
-#define HSIO_CLK_CFG_CLKDIV_PHY_X(x)                      (((x) & GENMASK(8, 1)) >> 1)
-#define HSIO_CLK_CFG_CLKDIV_PHY_DIS                       BIT(0)
-
-#define HSIO_TEMP_SENSOR_CTRL_FORCE_TEMP_RD               BIT(5)
-#define HSIO_TEMP_SENSOR_CTRL_FORCE_RUN                   BIT(4)
-#define HSIO_TEMP_SENSOR_CTRL_FORCE_NO_RST                BIT(3)
-#define HSIO_TEMP_SENSOR_CTRL_FORCE_POWER_UP              BIT(2)
-#define HSIO_TEMP_SENSOR_CTRL_FORCE_CLK                   BIT(1)
-#define HSIO_TEMP_SENSOR_CTRL_SAMPLE_ENA                  BIT(0)
-
-#define HSIO_TEMP_SENSOR_CFG_RUN_WID(x)                   (((x) << 8) & GENMASK(15, 8))
-#define HSIO_TEMP_SENSOR_CFG_RUN_WID_M                    GENMASK(15, 8)
-#define HSIO_TEMP_SENSOR_CFG_RUN_WID_X(x)                 (((x) & GENMASK(15, 8)) >> 8)
-#define HSIO_TEMP_SENSOR_CFG_SAMPLE_PER(x)                ((x) & GENMASK(7, 0))
-#define HSIO_TEMP_SENSOR_CFG_SAMPLE_PER_M                 GENMASK(7, 0)
-
-#define HSIO_TEMP_SENSOR_STAT_TEMP_VALID                  BIT(8)
-#define HSIO_TEMP_SENSOR_STAT_TEMP(x)                     ((x) & GENMASK(7, 0))
-#define HSIO_TEMP_SENSOR_STAT_TEMP_M                      GENMASK(7, 0)
-
-#endif
diff --git a/include/soc/mscc/ocelot_hsio.h b/include/soc/mscc/ocelot_hsio.h
new file mode 100644
index 000000000000..d93ddec3931b
--- /dev/null
+++ b/include/soc/mscc/ocelot_hsio.h
@@ -0,0 +1,785 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_HSIO_H_
+#define _MSCC_OCELOT_HSIO_H_
+
+#define HSIO_PLL5G_CFG0_ENA_ROT                           BIT(31)
+#define HSIO_PLL5G_CFG0_ENA_LANE                          BIT(30)
+#define HSIO_PLL5G_CFG0_ENA_CLKTREE                       BIT(29)
+#define HSIO_PLL5G_CFG0_DIV4                              BIT(28)
+#define HSIO_PLL5G_CFG0_ENA_LOCK_FINE                     BIT(27)
+#define HSIO_PLL5G_CFG0_SELBGV820(x)                      (((x) << 23) & GENMASK(26, 23))
+#define HSIO_PLL5G_CFG0_SELBGV820_M                       GENMASK(26, 23)
+#define HSIO_PLL5G_CFG0_SELBGV820_X(x)                    (((x) & GENMASK(26, 23)) >> 23)
+#define HSIO_PLL5G_CFG0_LOOP_BW_RES(x)                    (((x) << 18) & GENMASK(22, 18))
+#define HSIO_PLL5G_CFG0_LOOP_BW_RES_M                     GENMASK(22, 18)
+#define HSIO_PLL5G_CFG0_LOOP_BW_RES_X(x)                  (((x) & GENMASK(22, 18)) >> 18)
+#define HSIO_PLL5G_CFG0_SELCPI(x)                         (((x) << 16) & GENMASK(17, 16))
+#define HSIO_PLL5G_CFG0_SELCPI_M                          GENMASK(17, 16)
+#define HSIO_PLL5G_CFG0_SELCPI_X(x)                       (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_PLL5G_CFG0_ENA_VCO_CONTRH                    BIT(15)
+#define HSIO_PLL5G_CFG0_ENA_CP1                           BIT(14)
+#define HSIO_PLL5G_CFG0_ENA_VCO_BUF                       BIT(13)
+#define HSIO_PLL5G_CFG0_ENA_BIAS                          BIT(12)
+#define HSIO_PLL5G_CFG0_CPU_CLK_DIV(x)                    (((x) << 6) & GENMASK(11, 6))
+#define HSIO_PLL5G_CFG0_CPU_CLK_DIV_M                     GENMASK(11, 6)
+#define HSIO_PLL5G_CFG0_CPU_CLK_DIV_X(x)                  (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_PLL5G_CFG0_CORE_CLK_DIV(x)                   ((x) & GENMASK(5, 0))
+#define HSIO_PLL5G_CFG0_CORE_CLK_DIV_M                    GENMASK(5, 0)
+
+#define HSIO_PLL5G_CFG1_ENA_DIRECT                        BIT(18)
+#define HSIO_PLL5G_CFG1_ROT_SPEED                         BIT(17)
+#define HSIO_PLL5G_CFG1_ROT_DIR                           BIT(16)
+#define HSIO_PLL5G_CFG1_READBACK_DATA_SEL                 BIT(15)
+#define HSIO_PLL5G_CFG1_RC_ENABLE                         BIT(14)
+#define HSIO_PLL5G_CFG1_RC_CTRL_DATA(x)                   (((x) << 6) & GENMASK(13, 6))
+#define HSIO_PLL5G_CFG1_RC_CTRL_DATA_M                    GENMASK(13, 6)
+#define HSIO_PLL5G_CFG1_RC_CTRL_DATA_X(x)                 (((x) & GENMASK(13, 6)) >> 6)
+#define HSIO_PLL5G_CFG1_QUARTER_RATE                      BIT(5)
+#define HSIO_PLL5G_CFG1_PWD_TX                            BIT(4)
+#define HSIO_PLL5G_CFG1_PWD_RX                            BIT(3)
+#define HSIO_PLL5G_CFG1_OUT_OF_RANGE_RECAL_ENA            BIT(2)
+#define HSIO_PLL5G_CFG1_HALF_RATE                         BIT(1)
+#define HSIO_PLL5G_CFG1_FORCE_SET_ENA                     BIT(0)
+
+#define HSIO_PLL5G_CFG2_ENA_TEST_MODE                     BIT(30)
+#define HSIO_PLL5G_CFG2_ENA_PFD_IN_FLIP                   BIT(29)
+#define HSIO_PLL5G_CFG2_ENA_VCO_NREF_TESTOUT              BIT(28)
+#define HSIO_PLL5G_CFG2_ENA_FBTESTOUT                     BIT(27)
+#define HSIO_PLL5G_CFG2_ENA_RCPLL                         BIT(26)
+#define HSIO_PLL5G_CFG2_ENA_CP2                           BIT(25)
+#define HSIO_PLL5G_CFG2_ENA_CLK_BYPASS1                   BIT(24)
+#define HSIO_PLL5G_CFG2_AMPC_SEL(x)                       (((x) << 16) & GENMASK(23, 16))
+#define HSIO_PLL5G_CFG2_AMPC_SEL_M                        GENMASK(23, 16)
+#define HSIO_PLL5G_CFG2_AMPC_SEL_X(x)                     (((x) & GENMASK(23, 16)) >> 16)
+#define HSIO_PLL5G_CFG2_ENA_CLK_BYPASS                    BIT(15)
+#define HSIO_PLL5G_CFG2_PWD_AMPCTRL_N                     BIT(14)
+#define HSIO_PLL5G_CFG2_ENA_AMPCTRL                       BIT(13)
+#define HSIO_PLL5G_CFG2_ENA_AMP_CTRL_FORCE                BIT(12)
+#define HSIO_PLL5G_CFG2_FRC_FSM_POR                       BIT(11)
+#define HSIO_PLL5G_CFG2_DISABLE_FSM_POR                   BIT(10)
+#define HSIO_PLL5G_CFG2_GAIN_TEST(x)                      (((x) << 5) & GENMASK(9, 5))
+#define HSIO_PLL5G_CFG2_GAIN_TEST_M                       GENMASK(9, 5)
+#define HSIO_PLL5G_CFG2_GAIN_TEST_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
+#define HSIO_PLL5G_CFG2_EN_RESET_OVERRUN                  BIT(4)
+#define HSIO_PLL5G_CFG2_EN_RESET_LIM_DET                  BIT(3)
+#define HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET                  BIT(2)
+#define HSIO_PLL5G_CFG2_DISABLE_FSM                       BIT(1)
+#define HSIO_PLL5G_CFG2_ENA_GAIN_TEST                     BIT(0)
+
+#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL(x)               (((x) << 22) & GENMASK(23, 22))
+#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL_M                GENMASK(23, 22)
+#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL_X(x)             (((x) & GENMASK(23, 22)) >> 22)
+#define HSIO_PLL5G_CFG3_TESTOUT_SEL(x)                    (((x) << 19) & GENMASK(21, 19))
+#define HSIO_PLL5G_CFG3_TESTOUT_SEL_M                     GENMASK(21, 19)
+#define HSIO_PLL5G_CFG3_TESTOUT_SEL_X(x)                  (((x) & GENMASK(21, 19)) >> 19)
+#define HSIO_PLL5G_CFG3_ENA_ANA_TEST_OUT                  BIT(18)
+#define HSIO_PLL5G_CFG3_ENA_TEST_OUT                      BIT(17)
+#define HSIO_PLL5G_CFG3_SEL_FBDCLK                        BIT(16)
+#define HSIO_PLL5G_CFG3_SEL_CML_CMOS_PFD                  BIT(15)
+#define HSIO_PLL5G_CFG3_RST_FB_N                          BIT(14)
+#define HSIO_PLL5G_CFG3_FORCE_VCO_CONTRH                  BIT(13)
+#define HSIO_PLL5G_CFG3_FORCE_LO                          BIT(12)
+#define HSIO_PLL5G_CFG3_FORCE_HI                          BIT(11)
+#define HSIO_PLL5G_CFG3_FORCE_ENA                         BIT(10)
+#define HSIO_PLL5G_CFG3_FORCE_CP                          BIT(9)
+#define HSIO_PLL5G_CFG3_FBDIVSEL_TST_ENA                  BIT(8)
+#define HSIO_PLL5G_CFG3_FBDIVSEL(x)                       ((x) & GENMASK(7, 0))
+#define HSIO_PLL5G_CFG3_FBDIVSEL_M                        GENMASK(7, 0)
+
+#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL(x)                   (((x) << 16) & GENMASK(23, 16))
+#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL_M                    GENMASK(23, 16)
+#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL_X(x)                 (((x) & GENMASK(23, 16)) >> 16)
+#define HSIO_PLL5G_CFG4_IB_CTRL(x)                        ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_CFG4_IB_CTRL_M                         GENMASK(15, 0)
+
+#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL(x)                   (((x) << 16) & GENMASK(23, 16))
+#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL_M                    GENMASK(23, 16)
+#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL_X(x)                 (((x) & GENMASK(23, 16)) >> 16)
+#define HSIO_PLL5G_CFG5_OB_CTRL(x)                        ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_CFG5_OB_CTRL_M                         GENMASK(15, 0)
+
+#define HSIO_PLL5G_CFG6_REFCLK_SEL_SRC                    BIT(23)
+#define HSIO_PLL5G_CFG6_REFCLK_SEL(x)                     (((x) << 20) & GENMASK(22, 20))
+#define HSIO_PLL5G_CFG6_REFCLK_SEL_M                      GENMASK(22, 20)
+#define HSIO_PLL5G_CFG6_REFCLK_SEL_X(x)                   (((x) & GENMASK(22, 20)) >> 20)
+#define HSIO_PLL5G_CFG6_REFCLK_SRC                        BIT(19)
+#define HSIO_PLL5G_CFG6_POR_DEL_SEL(x)                    (((x) << 16) & GENMASK(17, 16))
+#define HSIO_PLL5G_CFG6_POR_DEL_SEL_M                     GENMASK(17, 16)
+#define HSIO_PLL5G_CFG6_POR_DEL_SEL_X(x)                  (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_PLL5G_CFG6_DIV125REF_SEL(x)                  (((x) << 8) & GENMASK(15, 8))
+#define HSIO_PLL5G_CFG6_DIV125REF_SEL_M                   GENMASK(15, 8)
+#define HSIO_PLL5G_CFG6_DIV125REF_SEL_X(x)                (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_PLL5G_CFG6_ENA_REFCLKC2                      BIT(7)
+#define HSIO_PLL5G_CFG6_ENA_FBCLKC2                       BIT(6)
+#define HSIO_PLL5G_CFG6_DDR_CLK_DIV(x)                    ((x) & GENMASK(5, 0))
+#define HSIO_PLL5G_CFG6_DDR_CLK_DIV_M                     GENMASK(5, 0)
+
+#define HSIO_PLL5G_STATUS0_RANGE_LIM                      BIT(12)
+#define HSIO_PLL5G_STATUS0_OUT_OF_RANGE_ERR               BIT(11)
+#define HSIO_PLL5G_STATUS0_CALIBRATION_ERR                BIT(10)
+#define HSIO_PLL5G_STATUS0_CALIBRATION_DONE               BIT(9)
+#define HSIO_PLL5G_STATUS0_READBACK_DATA(x)               (((x) << 1) & GENMASK(8, 1))
+#define HSIO_PLL5G_STATUS0_READBACK_DATA_M                GENMASK(8, 1)
+#define HSIO_PLL5G_STATUS0_READBACK_DATA_X(x)             (((x) & GENMASK(8, 1)) >> 1)
+#define HSIO_PLL5G_STATUS0_LOCK_STATUS                    BIT(0)
+
+#define HSIO_PLL5G_STATUS1_SIG_DEL(x)                     (((x) << 21) & GENMASK(28, 21))
+#define HSIO_PLL5G_STATUS1_SIG_DEL_M                      GENMASK(28, 21)
+#define HSIO_PLL5G_STATUS1_SIG_DEL_X(x)                   (((x) & GENMASK(28, 21)) >> 21)
+#define HSIO_PLL5G_STATUS1_GAIN_STAT(x)                   (((x) << 16) & GENMASK(20, 16))
+#define HSIO_PLL5G_STATUS1_GAIN_STAT_M                    GENMASK(20, 16)
+#define HSIO_PLL5G_STATUS1_GAIN_STAT_X(x)                 (((x) & GENMASK(20, 16)) >> 16)
+#define HSIO_PLL5G_STATUS1_FBCNT_DIF(x)                   (((x) << 4) & GENMASK(13, 4))
+#define HSIO_PLL5G_STATUS1_FBCNT_DIF_M                    GENMASK(13, 4)
+#define HSIO_PLL5G_STATUS1_FBCNT_DIF_X(x)                 (((x) & GENMASK(13, 4)) >> 4)
+#define HSIO_PLL5G_STATUS1_FSM_STAT(x)                    (((x) << 1) & GENMASK(3, 1))
+#define HSIO_PLL5G_STATUS1_FSM_STAT_M                     GENMASK(3, 1)
+#define HSIO_PLL5G_STATUS1_FSM_STAT_X(x)                  (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_PLL5G_STATUS1_FSM_LOCK                       BIT(0)
+
+#define HSIO_PLL5G_BIST_CFG0_PLLB_START_BIST              BIT(31)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_MEAS_MODE               BIT(30)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT(x)          (((x) << 20) & GENMASK(23, 20))
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT_M           GENMASK(23, 20)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT_X(x)        (((x) & GENMASK(23, 20)) >> 20)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT(x)          (((x) << 16) & GENMASK(19, 16))
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT_M           GENMASK(19, 16)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT_X(x)        (((x) & GENMASK(19, 16)) >> 16)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_DIV_FACTOR_PRE(x)       ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_BIST_CFG0_PLLB_DIV_FACTOR_PRE_M        GENMASK(15, 0)
+
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT(x)            (((x) << 4) & GENMASK(7, 4))
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT_M             GENMASK(7, 4)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT_X(x)          (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_BUSY                   BIT(2)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_DONE_N                 BIT(1)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FAIL                   BIT(0)
+
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT(x)             (((x) << 16) & GENMASK(31, 16))
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT_M              GENMASK(31, 16)
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT_X(x)           (((x) & GENMASK(31, 16)) >> 16)
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_REF_DIFF(x)        ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_REF_DIFF_M         GENMASK(15, 0)
+
+#define HSIO_RCOMP_CFG0_PWD_ENA                           BIT(13)
+#define HSIO_RCOMP_CFG0_RUN_CAL                           BIT(12)
+#define HSIO_RCOMP_CFG0_SPEED_SEL(x)                      (((x) << 10) & GENMASK(11, 10))
+#define HSIO_RCOMP_CFG0_SPEED_SEL_M                       GENMASK(11, 10)
+#define HSIO_RCOMP_CFG0_SPEED_SEL_X(x)                    (((x) & GENMASK(11, 10)) >> 10)
+#define HSIO_RCOMP_CFG0_MODE_SEL(x)                       (((x) << 8) & GENMASK(9, 8))
+#define HSIO_RCOMP_CFG0_MODE_SEL_M                        GENMASK(9, 8)
+#define HSIO_RCOMP_CFG0_MODE_SEL_X(x)                     (((x) & GENMASK(9, 8)) >> 8)
+#define HSIO_RCOMP_CFG0_FORCE_ENA                         BIT(4)
+#define HSIO_RCOMP_CFG0_RCOMP_VAL(x)                      ((x) & GENMASK(3, 0))
+#define HSIO_RCOMP_CFG0_RCOMP_VAL_M                       GENMASK(3, 0)
+
+#define HSIO_RCOMP_STATUS_BUSY                            BIT(12)
+#define HSIO_RCOMP_STATUS_DELTA_ALERT                     BIT(7)
+#define HSIO_RCOMP_STATUS_RCOMP(x)                        ((x) & GENMASK(3, 0))
+#define HSIO_RCOMP_STATUS_RCOMP_M                         GENMASK(3, 0)
+
+#define HSIO_SYNC_ETH_CFG_RSZ                             0x4
+
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC(x)             (((x) << 4) & GENMASK(7, 4))
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC_M              GENMASK(7, 4)
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC_X(x)           (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV(x)             (((x) << 1) & GENMASK(3, 1))
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV_M              GENMASK(3, 1)
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV_X(x)           (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_SYNC_ETH_CFG_RECO_CLK_ENA                    BIT(0)
+
+#define HSIO_SYNC_ETH_PLL_CFG_PLL_AUTO_SQUELCH_ENA        BIT(0)
+
+#define HSIO_S1G_DES_CFG_DES_PHS_CTRL(x)                  (((x) << 13) & GENMASK(16, 13))
+#define HSIO_S1G_DES_CFG_DES_PHS_CTRL_M                   GENMASK(16, 13)
+#define HSIO_S1G_DES_CFG_DES_PHS_CTRL_X(x)                (((x) & GENMASK(16, 13)) >> 13)
+#define HSIO_S1G_DES_CFG_DES_CPMD_SEL(x)                  (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S1G_DES_CFG_DES_CPMD_SEL_M                   GENMASK(12, 11)
+#define HSIO_S1G_DES_CFG_DES_CPMD_SEL_X(x)                (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL(x)                 (((x) << 8) & GENMASK(10, 8))
+#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL_M                  GENMASK(10, 8)
+#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL_X(x)               (((x) & GENMASK(10, 8)) >> 8)
+#define HSIO_S1G_DES_CFG_DES_BW_ANA(x)                    (((x) << 5) & GENMASK(7, 5))
+#define HSIO_S1G_DES_CFG_DES_BW_ANA_M                     GENMASK(7, 5)
+#define HSIO_S1G_DES_CFG_DES_BW_ANA_X(x)                  (((x) & GENMASK(7, 5)) >> 5)
+#define HSIO_S1G_DES_CFG_DES_SWAP_ANA                     BIT(4)
+#define HSIO_S1G_DES_CFG_DES_BW_HYST(x)                   (((x) << 1) & GENMASK(3, 1))
+#define HSIO_S1G_DES_CFG_DES_BW_HYST_M                    GENMASK(3, 1)
+#define HSIO_S1G_DES_CFG_DES_BW_HYST_X(x)                 (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_S1G_DES_CFG_DES_SWAP_HYST                    BIT(0)
+
+#define HSIO_S1G_IB_CFG_IB_FX100_ENA                      BIT(27)
+#define HSIO_S1G_IB_CFG_ACJTAG_HYST(x)                    (((x) << 24) & GENMASK(26, 24))
+#define HSIO_S1G_IB_CFG_ACJTAG_HYST_M                     GENMASK(26, 24)
+#define HSIO_S1G_IB_CFG_ACJTAG_HYST_X(x)                  (((x) & GENMASK(26, 24)) >> 24)
+#define HSIO_S1G_IB_CFG_IB_DET_LEV(x)                     (((x) << 19) & GENMASK(21, 19))
+#define HSIO_S1G_IB_CFG_IB_DET_LEV_M                      GENMASK(21, 19)
+#define HSIO_S1G_IB_CFG_IB_DET_LEV_X(x)                   (((x) & GENMASK(21, 19)) >> 19)
+#define HSIO_S1G_IB_CFG_IB_HYST_LEV                       BIT(14)
+#define HSIO_S1G_IB_CFG_IB_ENA_CMV_TERM                   BIT(13)
+#define HSIO_S1G_IB_CFG_IB_ENA_DC_COUPLING                BIT(12)
+#define HSIO_S1G_IB_CFG_IB_ENA_DETLEV                     BIT(11)
+#define HSIO_S1G_IB_CFG_IB_ENA_HYST                       BIT(10)
+#define HSIO_S1G_IB_CFG_IB_ENA_OFFSET_COMP                BIT(9)
+#define HSIO_S1G_IB_CFG_IB_EQ_GAIN(x)                     (((x) << 6) & GENMASK(8, 6))
+#define HSIO_S1G_IB_CFG_IB_EQ_GAIN_M                      GENMASK(8, 6)
+#define HSIO_S1G_IB_CFG_IB_EQ_GAIN_X(x)                   (((x) & GENMASK(8, 6)) >> 6)
+#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ(x)             (((x) << 4) & GENMASK(5, 4))
+#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ_M              GENMASK(5, 4)
+#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ_X(x)           (((x) & GENMASK(5, 4)) >> 4)
+#define HSIO_S1G_IB_CFG_IB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
+#define HSIO_S1G_IB_CFG_IB_RESISTOR_CTRL_M                GENMASK(3, 0)
+
+#define HSIO_S1G_OB_CFG_OB_SLP(x)                         (((x) << 17) & GENMASK(18, 17))
+#define HSIO_S1G_OB_CFG_OB_SLP_M                          GENMASK(18, 17)
+#define HSIO_S1G_OB_CFG_OB_SLP_X(x)                       (((x) & GENMASK(18, 17)) >> 17)
+#define HSIO_S1G_OB_CFG_OB_AMP_CTRL(x)                    (((x) << 13) & GENMASK(16, 13))
+#define HSIO_S1G_OB_CFG_OB_AMP_CTRL_M                     GENMASK(16, 13)
+#define HSIO_S1G_OB_CFG_OB_AMP_CTRL_X(x)                  (((x) & GENMASK(16, 13)) >> 13)
+#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL(x)               (((x) << 10) & GENMASK(12, 10))
+#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL_M                GENMASK(12, 10)
+#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL_X(x)             (((x) & GENMASK(12, 10)) >> 10)
+#define HSIO_S1G_OB_CFG_OB_DIS_VCM_CTRL                   BIT(9)
+#define HSIO_S1G_OB_CFG_OB_EN_MEAS_VREG                   BIT(8)
+#define HSIO_S1G_OB_CFG_OB_VCM_CTRL(x)                    (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S1G_OB_CFG_OB_VCM_CTRL_M                     GENMASK(7, 4)
+#define HSIO_S1G_OB_CFG_OB_VCM_CTRL_X(x)                  (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S1G_OB_CFG_OB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
+#define HSIO_S1G_OB_CFG_OB_RESISTOR_CTRL_M                GENMASK(3, 0)
+
+#define HSIO_S1G_SER_CFG_SER_IDLE                         BIT(9)
+#define HSIO_S1G_SER_CFG_SER_DEEMPH                       BIT(8)
+#define HSIO_S1G_SER_CFG_SER_CPMD_SEL                     BIT(7)
+#define HSIO_S1G_SER_CFG_SER_SWAP_CPMD                    BIT(6)
+#define HSIO_S1G_SER_CFG_SER_ALISEL(x)                    (((x) << 4) & GENMASK(5, 4))
+#define HSIO_S1G_SER_CFG_SER_ALISEL_M                     GENMASK(5, 4)
+#define HSIO_S1G_SER_CFG_SER_ALISEL_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
+#define HSIO_S1G_SER_CFG_SER_ENHYS                        BIT(3)
+#define HSIO_S1G_SER_CFG_SER_BIG_WIN                      BIT(2)
+#define HSIO_S1G_SER_CFG_SER_EN_WIN                       BIT(1)
+#define HSIO_S1G_SER_CFG_SER_ENALI                        BIT(0)
+
+#define HSIO_S1G_COMMON_CFG_SYS_RST                       BIT(31)
+#define HSIO_S1G_COMMON_CFG_SE_AUTO_SQUELCH_ENA           BIT(21)
+#define HSIO_S1G_COMMON_CFG_ENA_LANE                      BIT(18)
+#define HSIO_S1G_COMMON_CFG_PWD_RX                        BIT(17)
+#define HSIO_S1G_COMMON_CFG_PWD_TX                        BIT(16)
+#define HSIO_S1G_COMMON_CFG_LANE_CTRL(x)                  (((x) << 13) & GENMASK(15, 13))
+#define HSIO_S1G_COMMON_CFG_LANE_CTRL_M                   GENMASK(15, 13)
+#define HSIO_S1G_COMMON_CFG_LANE_CTRL_X(x)                (((x) & GENMASK(15, 13)) >> 13)
+#define HSIO_S1G_COMMON_CFG_ENA_DIRECT                    BIT(12)
+#define HSIO_S1G_COMMON_CFG_ENA_ELOOP                     BIT(11)
+#define HSIO_S1G_COMMON_CFG_ENA_FLOOP                     BIT(10)
+#define HSIO_S1G_COMMON_CFG_ENA_ILOOP                     BIT(9)
+#define HSIO_S1G_COMMON_CFG_ENA_PLOOP                     BIT(8)
+#define HSIO_S1G_COMMON_CFG_HRATE                         BIT(7)
+#define HSIO_S1G_COMMON_CFG_IF_MODE                       BIT(0)
+
+#define HSIO_S1G_PLL_CFG_PLL_ENA_FB_DIV2                  BIT(22)
+#define HSIO_S1G_PLL_CFG_PLL_ENA_RC_DIV2                  BIT(21)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA(x)             (((x) << 8) & GENMASK(15, 8))
+#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_M              GENMASK(15, 8)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_X(x)           (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_ENA                      BIT(7)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_FORCE_SET_ENA            BIT(6)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_OOR_RECAL_ENA            BIT(5)
+#define HSIO_S1G_PLL_CFG_PLL_RB_DATA_SEL                  BIT(3)
+
+#define HSIO_S1G_PLL_STATUS_PLL_CAL_NOT_DONE              BIT(12)
+#define HSIO_S1G_PLL_STATUS_PLL_CAL_ERR                   BIT(11)
+#define HSIO_S1G_PLL_STATUS_PLL_OUT_OF_RANGE_ERR          BIT(10)
+#define HSIO_S1G_PLL_STATUS_PLL_RB_DATA(x)                ((x) & GENMASK(7, 0))
+#define HSIO_S1G_PLL_STATUS_PLL_RB_DATA_M                 GENMASK(7, 0)
+
+#define HSIO_S1G_DFT_CFG0_LAZYBIT                         BIT(31)
+#define HSIO_S1G_DFT_CFG0_INV_DIS                         BIT(23)
+#define HSIO_S1G_DFT_CFG0_PRBS_SEL(x)                     (((x) << 20) & GENMASK(21, 20))
+#define HSIO_S1G_DFT_CFG0_PRBS_SEL_M                      GENMASK(21, 20)
+#define HSIO_S1G_DFT_CFG0_PRBS_SEL_X(x)                   (((x) & GENMASK(21, 20)) >> 20)
+#define HSIO_S1G_DFT_CFG0_TEST_MODE(x)                    (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S1G_DFT_CFG0_TEST_MODE_M                     GENMASK(18, 16)
+#define HSIO_S1G_DFT_CFG0_TEST_MODE_X(x)                  (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S1G_DFT_CFG0_RX_PHS_CORR_DIS                 BIT(4)
+#define HSIO_S1G_DFT_CFG0_RX_PDSENS_ENA                   BIT(3)
+#define HSIO_S1G_DFT_CFG0_RX_DFT_ENA                      BIT(2)
+#define HSIO_S1G_DFT_CFG0_TX_DFT_ENA                      BIT(0)
+
+#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S1G_DFT_CFG1_TX_JI_ENA                       BIT(3)
+#define HSIO_S1G_DFT_CFG1_TX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S1G_DFT_CFG1_TX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S1G_DFT_CFG1_TX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S1G_DFT_CFG2_RX_JI_ENA                       BIT(3)
+#define HSIO_S1G_DFT_CFG2_RX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S1G_DFT_CFG2_RX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S1G_DFT_CFG2_RX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_ENA             BIT(20)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH(x)     (((x) << 16) & GENMASK(17, 16))
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_M      GENMASK(17, 16)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_X(x)   (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH(x)         (((x) << 8) & GENMASK(15, 8))
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_M          GENMASK(15, 8)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_X(x)       (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_LOW(x)          ((x) & GENMASK(7, 0))
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_LOW_M           GENMASK(7, 0)
+
+#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE(x)          (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE_M           GENMASK(12, 11)
+#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE_X(x)        (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_SWAP             BIT(10)
+#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_MODE             BIT(9)
+#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_ENA              BIT(8)
+#define HSIO_S1G_MISC_CFG_RX_LPI_MODE_ENA                 BIT(5)
+#define HSIO_S1G_MISC_CFG_TX_LPI_MODE_ENA                 BIT(4)
+#define HSIO_S1G_MISC_CFG_RX_DATA_INV_ENA                 BIT(3)
+#define HSIO_S1G_MISC_CFG_TX_DATA_INV_ENA                 BIT(2)
+#define HSIO_S1G_MISC_CFG_LANE_RST                        BIT(0)
+
+#define HSIO_S1G_DFT_STATUS_PLL_BIST_NOT_DONE             BIT(7)
+#define HSIO_S1G_DFT_STATUS_PLL_BIST_FAILED               BIT(6)
+#define HSIO_S1G_DFT_STATUS_PLL_BIST_TIMEOUT_ERR          BIT(5)
+#define HSIO_S1G_DFT_STATUS_BIST_ACTIVE                   BIT(3)
+#define HSIO_S1G_DFT_STATUS_BIST_NOSYNC                   BIT(2)
+#define HSIO_S1G_DFT_STATUS_BIST_COMPLETE_N               BIT(1)
+#define HSIO_S1G_DFT_STATUS_BIST_ERROR                    BIT(0)
+
+#define HSIO_S1G_MISC_STATUS_DES_100FX_PHASE_SEL          BIT(0)
+
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_WR_ONE_SHOT        BIT(31)
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_RD_ONE_SHOT        BIT(30)
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR(x)            ((x) & GENMASK(8, 0))
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR_M             GENMASK(8, 0)
+
+#define HSIO_S6G_DIG_CFG_GP(x)                            (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S6G_DIG_CFG_GP_M                             GENMASK(18, 16)
+#define HSIO_S6G_DIG_CFG_GP_X(x)                          (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S6G_DIG_CFG_TX_BIT_DOUBLING_MODE_ENA         BIT(7)
+#define HSIO_S6G_DIG_CFG_SIGDET_TESTMODE                  BIT(6)
+#define HSIO_S6G_DIG_CFG_SIGDET_AST(x)                    (((x) << 3) & GENMASK(5, 3))
+#define HSIO_S6G_DIG_CFG_SIGDET_AST_M                     GENMASK(5, 3)
+#define HSIO_S6G_DIG_CFG_SIGDET_AST_X(x)                  (((x) & GENMASK(5, 3)) >> 3)
+#define HSIO_S6G_DIG_CFG_SIGDET_DST(x)                    ((x) & GENMASK(2, 0))
+#define HSIO_S6G_DIG_CFG_SIGDET_DST_M                     GENMASK(2, 0)
+
+#define HSIO_S6G_DFT_CFG0_LAZYBIT                         BIT(31)
+#define HSIO_S6G_DFT_CFG0_INV_DIS                         BIT(23)
+#define HSIO_S6G_DFT_CFG0_PRBS_SEL(x)                     (((x) << 20) & GENMASK(21, 20))
+#define HSIO_S6G_DFT_CFG0_PRBS_SEL_M                      GENMASK(21, 20)
+#define HSIO_S6G_DFT_CFG0_PRBS_SEL_X(x)                   (((x) & GENMASK(21, 20)) >> 20)
+#define HSIO_S6G_DFT_CFG0_TEST_MODE(x)                    (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S6G_DFT_CFG0_TEST_MODE_M                     GENMASK(18, 16)
+#define HSIO_S6G_DFT_CFG0_TEST_MODE_X(x)                  (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S6G_DFT_CFG0_RX_PHS_CORR_DIS                 BIT(4)
+#define HSIO_S6G_DFT_CFG0_RX_PDSENS_ENA                   BIT(3)
+#define HSIO_S6G_DFT_CFG0_RX_DFT_ENA                      BIT(2)
+#define HSIO_S6G_DFT_CFG0_TX_DFT_ENA                      BIT(0)
+
+#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S6G_DFT_CFG1_TX_JI_ENA                       BIT(3)
+#define HSIO_S6G_DFT_CFG1_TX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S6G_DFT_CFG1_TX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S6G_DFT_CFG1_TX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S6G_DFT_CFG2_RX_JI_ENA                       BIT(3)
+#define HSIO_S6G_DFT_CFG2_RX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S6G_DFT_CFG2_RX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S6G_DFT_CFG2_RX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_ENA             BIT(20)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH(x)     (((x) << 16) & GENMASK(19, 16))
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_M      GENMASK(19, 16)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_X(x)   (((x) & GENMASK(19, 16)) >> 16)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH(x)         (((x) << 8) & GENMASK(15, 8))
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_M          GENMASK(15, 8)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_X(x)       (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_LOW(x)          ((x) & GENMASK(7, 0))
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_LOW_M           GENMASK(7, 0)
+
+#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK(x)                 (((x) << 13) & GENMASK(14, 13))
+#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK_M                  GENMASK(14, 13)
+#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK_X(x)               (((x) & GENMASK(14, 13)) >> 13)
+#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE(x)          (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE_M           GENMASK(12, 11)
+#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE_X(x)        (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_SWAP             BIT(10)
+#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_MODE             BIT(9)
+#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_ENA              BIT(8)
+#define HSIO_S6G_MISC_CFG_RX_BUS_FLIP_ENA                 BIT(7)
+#define HSIO_S6G_MISC_CFG_TX_BUS_FLIP_ENA                 BIT(6)
+#define HSIO_S6G_MISC_CFG_RX_LPI_MODE_ENA                 BIT(5)
+#define HSIO_S6G_MISC_CFG_TX_LPI_MODE_ENA                 BIT(4)
+#define HSIO_S6G_MISC_CFG_RX_DATA_INV_ENA                 BIT(3)
+#define HSIO_S6G_MISC_CFG_TX_DATA_INV_ENA                 BIT(2)
+#define HSIO_S6G_MISC_CFG_LANE_RST                        BIT(0)
+
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0(x)               (((x) << 23) & GENMASK(28, 23))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0_M                GENMASK(28, 23)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0_X(x)             (((x) & GENMASK(28, 23)) >> 23)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1(x)               (((x) << 18) & GENMASK(22, 18))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1_M                GENMASK(22, 18)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1_X(x)             (((x) & GENMASK(22, 18)) >> 18)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC(x)                (((x) << 13) & GENMASK(17, 13))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC_M                 GENMASK(17, 13)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC_X(x)              (((x) & GENMASK(17, 13)) >> 13)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS(x)             (((x) << 6) & GENMASK(8, 6))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS_M              GENMASK(8, 6)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS_X(x)           (((x) & GENMASK(8, 6)) >> 6)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_LEV(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_LEV_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_DFT_STATUS_PRBS_SYNC_STAT                BIT(8)
+#define HSIO_S6G_DFT_STATUS_PLL_BIST_NOT_DONE             BIT(7)
+#define HSIO_S6G_DFT_STATUS_PLL_BIST_FAILED               BIT(6)
+#define HSIO_S6G_DFT_STATUS_PLL_BIST_TIMEOUT_ERR          BIT(5)
+#define HSIO_S6G_DFT_STATUS_BIST_ACTIVE                   BIT(3)
+#define HSIO_S6G_DFT_STATUS_BIST_NOSYNC                   BIT(2)
+#define HSIO_S6G_DFT_STATUS_BIST_COMPLETE_N               BIT(1)
+#define HSIO_S6G_DFT_STATUS_BIST_ERROR                    BIT(0)
+
+#define HSIO_S6G_MISC_STATUS_DES_100FX_PHASE_SEL          BIT(0)
+
+#define HSIO_S6G_DES_CFG_DES_PHS_CTRL(x)                  (((x) << 13) & GENMASK(16, 13))
+#define HSIO_S6G_DES_CFG_DES_PHS_CTRL_M                   GENMASK(16, 13)
+#define HSIO_S6G_DES_CFG_DES_PHS_CTRL_X(x)                (((x) & GENMASK(16, 13)) >> 13)
+#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL(x)                 (((x) << 10) & GENMASK(12, 10))
+#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL_M                  GENMASK(12, 10)
+#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL_X(x)               (((x) & GENMASK(12, 10)) >> 10)
+#define HSIO_S6G_DES_CFG_DES_CPMD_SEL(x)                  (((x) << 8) & GENMASK(9, 8))
+#define HSIO_S6G_DES_CFG_DES_CPMD_SEL_M                   GENMASK(9, 8)
+#define HSIO_S6G_DES_CFG_DES_CPMD_SEL_X(x)                (((x) & GENMASK(9, 8)) >> 8)
+#define HSIO_S6G_DES_CFG_DES_BW_HYST(x)                   (((x) << 5) & GENMASK(7, 5))
+#define HSIO_S6G_DES_CFG_DES_BW_HYST_M                    GENMASK(7, 5)
+#define HSIO_S6G_DES_CFG_DES_BW_HYST_X(x)                 (((x) & GENMASK(7, 5)) >> 5)
+#define HSIO_S6G_DES_CFG_DES_SWAP_HYST                    BIT(4)
+#define HSIO_S6G_DES_CFG_DES_BW_ANA(x)                    (((x) << 1) & GENMASK(3, 1))
+#define HSIO_S6G_DES_CFG_DES_BW_ANA_M                     GENMASK(3, 1)
+#define HSIO_S6G_DES_CFG_DES_BW_ANA_X(x)                  (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_S6G_DES_CFG_DES_SWAP_ANA                     BIT(0)
+
+#define HSIO_S6G_IB_CFG_IB_SOFSI(x)                       (((x) << 29) & GENMASK(30, 29))
+#define HSIO_S6G_IB_CFG_IB_SOFSI_M                        GENMASK(30, 29)
+#define HSIO_S6G_IB_CFG_IB_SOFSI_X(x)                     (((x) & GENMASK(30, 29)) >> 29)
+#define HSIO_S6G_IB_CFG_IB_VBULK_SEL                      BIT(28)
+#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ(x)                    (((x) << 24) & GENMASK(27, 24))
+#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ_M                     GENMASK(27, 24)
+#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ_X(x)                  (((x) & GENMASK(27, 24)) >> 24)
+#define HSIO_S6G_IB_CFG_IB_ICML_ADJ(x)                    (((x) << 20) & GENMASK(23, 20))
+#define HSIO_S6G_IB_CFG_IB_ICML_ADJ_M                     GENMASK(23, 20)
+#define HSIO_S6G_IB_CFG_IB_ICML_ADJ_X(x)                  (((x) & GENMASK(23, 20)) >> 20)
+#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL(x)               (((x) << 18) & GENMASK(19, 18))
+#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL_M                GENMASK(19, 18)
+#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL_X(x)             (((x) & GENMASK(19, 18)) >> 18)
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL(x)             (((x) << 15) & GENMASK(17, 15))
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL_M              GENMASK(17, 15)
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL_X(x)           (((x) & GENMASK(17, 15)) >> 15)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP(x)              (((x) << 13) & GENMASK(14, 13))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP_M               GENMASK(14, 13)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP_X(x)            (((x) & GENMASK(14, 13)) >> 13)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID(x)             (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID_M              GENMASK(12, 11)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID_X(x)           (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP(x)              (((x) << 9) & GENMASK(10, 9))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP_M               GENMASK(10, 9)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP_X(x)            (((x) & GENMASK(10, 9)) >> 9)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET(x)          (((x) << 7) & GENMASK(8, 7))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET_M           GENMASK(8, 7)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET_X(x)        (((x) & GENMASK(8, 7)) >> 7)
+#define HSIO_S6G_IB_CFG_IB_ANA_TEST_ENA                   BIT(6)
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_ENA                    BIT(5)
+#define HSIO_S6G_IB_CFG_IB_CONCUR                         BIT(4)
+#define HSIO_S6G_IB_CFG_IB_CAL_ENA                        BIT(3)
+#define HSIO_S6G_IB_CFG_IB_SAM_ENA                        BIT(2)
+#define HSIO_S6G_IB_CFG_IB_EQZ_ENA                        BIT(1)
+#define HSIO_S6G_IB_CFG_IB_REG_ENA                        BIT(0)
+
+#define HSIO_S6G_IB_CFG1_IB_TJTAG(x)                      (((x) << 17) & GENMASK(21, 17))
+#define HSIO_S6G_IB_CFG1_IB_TJTAG_M                       GENMASK(21, 17)
+#define HSIO_S6G_IB_CFG1_IB_TJTAG_X(x)                    (((x) & GENMASK(21, 17)) >> 17)
+#define HSIO_S6G_IB_CFG1_IB_TSDET(x)                      (((x) << 12) & GENMASK(16, 12))
+#define HSIO_S6G_IB_CFG1_IB_TSDET_M                       GENMASK(16, 12)
+#define HSIO_S6G_IB_CFG1_IB_TSDET_X(x)                    (((x) & GENMASK(16, 12)) >> 12)
+#define HSIO_S6G_IB_CFG1_IB_SCALY(x)                      (((x) << 8) & GENMASK(11, 8))
+#define HSIO_S6G_IB_CFG1_IB_SCALY_M                       GENMASK(11, 8)
+#define HSIO_S6G_IB_CFG1_IB_SCALY_X(x)                    (((x) & GENMASK(11, 8)) >> 8)
+#define HSIO_S6G_IB_CFG1_IB_FILT_HP                       BIT(7)
+#define HSIO_S6G_IB_CFG1_IB_FILT_MID                      BIT(6)
+#define HSIO_S6G_IB_CFG1_IB_FILT_LP                       BIT(5)
+#define HSIO_S6G_IB_CFG1_IB_FILT_OFFSET                   BIT(4)
+#define HSIO_S6G_IB_CFG1_IB_FRC_HP                        BIT(3)
+#define HSIO_S6G_IB_CFG1_IB_FRC_MID                       BIT(2)
+#define HSIO_S6G_IB_CFG1_IB_FRC_LP                        BIT(1)
+#define HSIO_S6G_IB_CFG1_IB_FRC_OFFSET                    BIT(0)
+
+#define HSIO_S6G_IB_CFG2_IB_TINFV(x)                      (((x) << 27) & GENMASK(29, 27))
+#define HSIO_S6G_IB_CFG2_IB_TINFV_M                       GENMASK(29, 27)
+#define HSIO_S6G_IB_CFG2_IB_TINFV_X(x)                    (((x) & GENMASK(29, 27)) >> 27)
+#define HSIO_S6G_IB_CFG2_IB_OINFI(x)                      (((x) << 22) & GENMASK(26, 22))
+#define HSIO_S6G_IB_CFG2_IB_OINFI_M                       GENMASK(26, 22)
+#define HSIO_S6G_IB_CFG2_IB_OINFI_X(x)                    (((x) & GENMASK(26, 22)) >> 22)
+#define HSIO_S6G_IB_CFG2_IB_TAUX(x)                       (((x) << 19) & GENMASK(21, 19))
+#define HSIO_S6G_IB_CFG2_IB_TAUX_M                        GENMASK(21, 19)
+#define HSIO_S6G_IB_CFG2_IB_TAUX_X(x)                     (((x) & GENMASK(21, 19)) >> 19)
+#define HSIO_S6G_IB_CFG2_IB_OINFS(x)                      (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S6G_IB_CFG2_IB_OINFS_M                       GENMASK(18, 16)
+#define HSIO_S6G_IB_CFG2_IB_OINFS_X(x)                    (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S6G_IB_CFG2_IB_OCALS(x)                      (((x) << 10) & GENMASK(15, 10))
+#define HSIO_S6G_IB_CFG2_IB_OCALS_M                       GENMASK(15, 10)
+#define HSIO_S6G_IB_CFG2_IB_OCALS_X(x)                    (((x) & GENMASK(15, 10)) >> 10)
+#define HSIO_S6G_IB_CFG2_IB_TCALV(x)                      (((x) << 5) & GENMASK(9, 5))
+#define HSIO_S6G_IB_CFG2_IB_TCALV_M                       GENMASK(9, 5)
+#define HSIO_S6G_IB_CFG2_IB_TCALV_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
+#define HSIO_S6G_IB_CFG2_IB_UMAX(x)                       (((x) << 3) & GENMASK(4, 3))
+#define HSIO_S6G_IB_CFG2_IB_UMAX_M                        GENMASK(4, 3)
+#define HSIO_S6G_IB_CFG2_IB_UMAX_X(x)                     (((x) & GENMASK(4, 3)) >> 3)
+#define HSIO_S6G_IB_CFG2_IB_UREG(x)                       ((x) & GENMASK(2, 0))
+#define HSIO_S6G_IB_CFG2_IB_UREG_M                        GENMASK(2, 0)
+
+#define HSIO_S6G_IB_CFG3_IB_INI_HP(x)                     (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_CFG3_IB_INI_HP_M                      GENMASK(23, 18)
+#define HSIO_S6G_IB_CFG3_IB_INI_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_CFG3_IB_INI_MID(x)                    (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_CFG3_IB_INI_MID_M                     GENMASK(17, 12)
+#define HSIO_S6G_IB_CFG3_IB_INI_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_CFG3_IB_INI_LP(x)                     (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_CFG3_IB_INI_LP_M                      GENMASK(11, 6)
+#define HSIO_S6G_IB_CFG3_IB_INI_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_CFG3_IB_INI_OFFSET(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_CFG3_IB_INI_OFFSET_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_IB_CFG4_IB_MAX_HP(x)                     (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_CFG4_IB_MAX_HP_M                      GENMASK(23, 18)
+#define HSIO_S6G_IB_CFG4_IB_MAX_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_CFG4_IB_MAX_MID(x)                    (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_CFG4_IB_MAX_MID_M                     GENMASK(17, 12)
+#define HSIO_S6G_IB_CFG4_IB_MAX_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_CFG4_IB_MAX_LP(x)                     (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_CFG4_IB_MAX_LP_M                      GENMASK(11, 6)
+#define HSIO_S6G_IB_CFG4_IB_MAX_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_CFG4_IB_MAX_OFFSET(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_CFG4_IB_MAX_OFFSET_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_IB_CFG5_IB_MIN_HP(x)                     (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_CFG5_IB_MIN_HP_M                      GENMASK(23, 18)
+#define HSIO_S6G_IB_CFG5_IB_MIN_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_CFG5_IB_MIN_MID(x)                    (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_CFG5_IB_MIN_MID_M                     GENMASK(17, 12)
+#define HSIO_S6G_IB_CFG5_IB_MIN_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_CFG5_IB_MIN_LP(x)                     (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_CFG5_IB_MIN_LP_M                      GENMASK(11, 6)
+#define HSIO_S6G_IB_CFG5_IB_MIN_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_CFG5_IB_MIN_OFFSET(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_CFG5_IB_MIN_OFFSET_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_OB_CFG_OB_IDLE                           BIT(31)
+#define HSIO_S6G_OB_CFG_OB_ENA1V_MODE                     BIT(30)
+#define HSIO_S6G_OB_CFG_OB_POL                            BIT(29)
+#define HSIO_S6G_OB_CFG_OB_POST0(x)                       (((x) << 23) & GENMASK(28, 23))
+#define HSIO_S6G_OB_CFG_OB_POST0_M                        GENMASK(28, 23)
+#define HSIO_S6G_OB_CFG_OB_POST0_X(x)                     (((x) & GENMASK(28, 23)) >> 23)
+#define HSIO_S6G_OB_CFG_OB_PREC(x)                        (((x) << 18) & GENMASK(22, 18))
+#define HSIO_S6G_OB_CFG_OB_PREC_M                         GENMASK(22, 18)
+#define HSIO_S6G_OB_CFG_OB_PREC_X(x)                      (((x) & GENMASK(22, 18)) >> 18)
+#define HSIO_S6G_OB_CFG_OB_R_ADJ_MUX                      BIT(17)
+#define HSIO_S6G_OB_CFG_OB_R_ADJ_PDR                      BIT(16)
+#define HSIO_S6G_OB_CFG_OB_POST1(x)                       (((x) << 11) & GENMASK(15, 11))
+#define HSIO_S6G_OB_CFG_OB_POST1_M                        GENMASK(15, 11)
+#define HSIO_S6G_OB_CFG_OB_POST1_X(x)                     (((x) & GENMASK(15, 11)) >> 11)
+#define HSIO_S6G_OB_CFG_OB_R_COR                          BIT(10)
+#define HSIO_S6G_OB_CFG_OB_SEL_RCTRL                      BIT(9)
+#define HSIO_S6G_OB_CFG_OB_SR_H                           BIT(8)
+#define HSIO_S6G_OB_CFG_OB_SR(x)                          (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S6G_OB_CFG_OB_SR_M                           GENMASK(7, 4)
+#define HSIO_S6G_OB_CFG_OB_SR_X(x)                        (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S6G_OB_CFG_OB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
+#define HSIO_S6G_OB_CFG_OB_RESISTOR_CTRL_M                GENMASK(3, 0)
+
+#define HSIO_S6G_OB_CFG1_OB_ENA_CAS(x)                    (((x) << 6) & GENMASK(8, 6))
+#define HSIO_S6G_OB_CFG1_OB_ENA_CAS_M                     GENMASK(8, 6)
+#define HSIO_S6G_OB_CFG1_OB_ENA_CAS_X(x)                  (((x) & GENMASK(8, 6)) >> 6)
+#define HSIO_S6G_OB_CFG1_OB_LEV(x)                        ((x) & GENMASK(5, 0))
+#define HSIO_S6G_OB_CFG1_OB_LEV_M                         GENMASK(5, 0)
+
+#define HSIO_S6G_SER_CFG_SER_4TAP_ENA                     BIT(8)
+#define HSIO_S6G_SER_CFG_SER_CPMD_SEL                     BIT(7)
+#define HSIO_S6G_SER_CFG_SER_SWAP_CPMD                    BIT(6)
+#define HSIO_S6G_SER_CFG_SER_ALISEL(x)                    (((x) << 4) & GENMASK(5, 4))
+#define HSIO_S6G_SER_CFG_SER_ALISEL_M                     GENMASK(5, 4)
+#define HSIO_S6G_SER_CFG_SER_ALISEL_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
+#define HSIO_S6G_SER_CFG_SER_ENHYS                        BIT(3)
+#define HSIO_S6G_SER_CFG_SER_BIG_WIN                      BIT(2)
+#define HSIO_S6G_SER_CFG_SER_EN_WIN                       BIT(1)
+#define HSIO_S6G_SER_CFG_SER_ENALI                        BIT(0)
+
+#define HSIO_S6G_COMMON_CFG_SYS_RST                       BIT(17)
+#define HSIO_S6G_COMMON_CFG_SE_DIV2_ENA                   BIT(16)
+#define HSIO_S6G_COMMON_CFG_SE_AUTO_SQUELCH_ENA           BIT(15)
+#define HSIO_S6G_COMMON_CFG_ENA_LANE                      BIT(14)
+#define HSIO_S6G_COMMON_CFG_PWD_RX                        BIT(13)
+#define HSIO_S6G_COMMON_CFG_PWD_TX                        BIT(12)
+#define HSIO_S6G_COMMON_CFG_LANE_CTRL(x)                  (((x) << 9) & GENMASK(11, 9))
+#define HSIO_S6G_COMMON_CFG_LANE_CTRL_M                   GENMASK(11, 9)
+#define HSIO_S6G_COMMON_CFG_LANE_CTRL_X(x)                (((x) & GENMASK(11, 9)) >> 9)
+#define HSIO_S6G_COMMON_CFG_ENA_DIRECT                    BIT(8)
+#define HSIO_S6G_COMMON_CFG_ENA_ELOOP                     BIT(7)
+#define HSIO_S6G_COMMON_CFG_ENA_FLOOP                     BIT(6)
+#define HSIO_S6G_COMMON_CFG_ENA_ILOOP                     BIT(5)
+#define HSIO_S6G_COMMON_CFG_ENA_PLOOP                     BIT(4)
+#define HSIO_S6G_COMMON_CFG_HRATE                         BIT(3)
+#define HSIO_S6G_COMMON_CFG_QRATE                         BIT(2)
+#define HSIO_S6G_COMMON_CFG_IF_MODE(x)                    ((x) & GENMASK(1, 0))
+#define HSIO_S6G_COMMON_CFG_IF_MODE_M                     GENMASK(1, 0)
+
+#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS(x)                  (((x) << 16) & GENMASK(17, 16))
+#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS_M                   GENMASK(17, 16)
+#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS_X(x)                (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_S6G_PLL_CFG_PLL_DIV4                         BIT(15)
+#define HSIO_S6G_PLL_CFG_PLL_ENA_ROT                      BIT(14)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA(x)             (((x) << 6) & GENMASK(13, 6))
+#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA_M              GENMASK(13, 6)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA_X(x)           (((x) & GENMASK(13, 6)) >> 6)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_ENA                      BIT(5)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_FORCE_SET_ENA            BIT(4)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_OOR_RECAL_ENA            BIT(3)
+#define HSIO_S6G_PLL_CFG_PLL_RB_DATA_SEL                  BIT(2)
+#define HSIO_S6G_PLL_CFG_PLL_ROT_DIR                      BIT(1)
+#define HSIO_S6G_PLL_CFG_PLL_ROT_FRQ                      BIT(0)
+
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_DATA_N            BIT(5)
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_DATA_P            BIT(4)
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_CLK               BIT(3)
+#define HSIO_S6G_ACJTAG_CFG_OB_DIRECT                     BIT(2)
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_ENA                    BIT(1)
+#define HSIO_S6G_ACJTAG_CFG_JTAG_CTRL_ENA                 BIT(0)
+
+#define HSIO_S6G_GP_CFG_GP_MSB(x)                         (((x) << 16) & GENMASK(31, 16))
+#define HSIO_S6G_GP_CFG_GP_MSB_M                          GENMASK(31, 16)
+#define HSIO_S6G_GP_CFG_GP_MSB_X(x)                       (((x) & GENMASK(31, 16)) >> 16)
+#define HSIO_S6G_GP_CFG_GP_LSB(x)                         ((x) & GENMASK(15, 0))
+#define HSIO_S6G_GP_CFG_GP_LSB_M                          GENMASK(15, 0)
+
+#define HSIO_S6G_IB_STATUS0_IB_CAL_DONE                   BIT(8)
+#define HSIO_S6G_IB_STATUS0_IB_HP_GAIN_ACT                BIT(7)
+#define HSIO_S6G_IB_STATUS0_IB_MID_GAIN_ACT               BIT(6)
+#define HSIO_S6G_IB_STATUS0_IB_LP_GAIN_ACT                BIT(5)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSET_ACT                 BIT(4)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSET_VLD                 BIT(3)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSET_ERR                 BIT(2)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSDIR                    BIT(1)
+#define HSIO_S6G_IB_STATUS0_IB_SIG_DET                    BIT(0)
+
+#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT(x)            (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT_M             GENMASK(23, 18)
+#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT_X(x)          (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT(x)           (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT_M            GENMASK(17, 12)
+#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT_X(x)         (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT(x)            (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT_M             GENMASK(11, 6)
+#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT_X(x)          (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_STATUS1_IB_OFFSET_STAT(x)             ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_STATUS1_IB_OFFSET_STAT_M              GENMASK(5, 0)
+
+#define HSIO_S6G_ACJTAG_STATUS_ACJTAG_CAPT_DATA_N         BIT(2)
+#define HSIO_S6G_ACJTAG_STATUS_ACJTAG_CAPT_DATA_P         BIT(1)
+#define HSIO_S6G_ACJTAG_STATUS_IB_DIRECT                  BIT(0)
+
+#define HSIO_S6G_PLL_STATUS_PLL_CAL_NOT_DONE              BIT(10)
+#define HSIO_S6G_PLL_STATUS_PLL_CAL_ERR                   BIT(9)
+#define HSIO_S6G_PLL_STATUS_PLL_OUT_OF_RANGE_ERR          BIT(8)
+#define HSIO_S6G_PLL_STATUS_PLL_RB_DATA(x)                ((x) & GENMASK(7, 0))
+#define HSIO_S6G_PLL_STATUS_PLL_RB_DATA_M                 GENMASK(7, 0)
+
+#define HSIO_S6G_REVID_SERDES_REV(x)                      (((x) << 26) & GENMASK(31, 26))
+#define HSIO_S6G_REVID_SERDES_REV_M                       GENMASK(31, 26)
+#define HSIO_S6G_REVID_SERDES_REV_X(x)                    (((x) & GENMASK(31, 26)) >> 26)
+#define HSIO_S6G_REVID_RCPLL_REV(x)                       (((x) << 21) & GENMASK(25, 21))
+#define HSIO_S6G_REVID_RCPLL_REV_M                        GENMASK(25, 21)
+#define HSIO_S6G_REVID_RCPLL_REV_X(x)                     (((x) & GENMASK(25, 21)) >> 21)
+#define HSIO_S6G_REVID_SER_REV(x)                         (((x) << 16) & GENMASK(20, 16))
+#define HSIO_S6G_REVID_SER_REV_M                          GENMASK(20, 16)
+#define HSIO_S6G_REVID_SER_REV_X(x)                       (((x) & GENMASK(20, 16)) >> 16)
+#define HSIO_S6G_REVID_DES_REV(x)                         (((x) << 10) & GENMASK(15, 10))
+#define HSIO_S6G_REVID_DES_REV_M                          GENMASK(15, 10)
+#define HSIO_S6G_REVID_DES_REV_X(x)                       (((x) & GENMASK(15, 10)) >> 10)
+#define HSIO_S6G_REVID_OB_REV(x)                          (((x) << 5) & GENMASK(9, 5))
+#define HSIO_S6G_REVID_OB_REV_M                           GENMASK(9, 5)
+#define HSIO_S6G_REVID_OB_REV_X(x)                        (((x) & GENMASK(9, 5)) >> 5)
+#define HSIO_S6G_REVID_IB_REV(x)                          ((x) & GENMASK(4, 0))
+#define HSIO_S6G_REVID_IB_REV_M                           GENMASK(4, 0)
+
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_WR_ONE_SHOT        BIT(31)
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_RD_ONE_SHOT        BIT(30)
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_ADDR(x)            ((x) & GENMASK(24, 0))
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_ADDR_M             GENMASK(24, 0)
+
+#define HSIO_HW_CFG_DEV2G5_10_MODE                        BIT(6)
+#define HSIO_HW_CFG_DEV1G_9_MODE                          BIT(5)
+#define HSIO_HW_CFG_DEV1G_6_MODE                          BIT(4)
+#define HSIO_HW_CFG_DEV1G_5_MODE                          BIT(3)
+#define HSIO_HW_CFG_DEV1G_4_MODE                          BIT(2)
+#define HSIO_HW_CFG_PCIE_ENA                              BIT(1)
+#define HSIO_HW_CFG_QSGMII_ENA                            BIT(0)
+
+#define HSIO_HW_QSGMII_CFG_SHYST_DIS                      BIT(3)
+#define HSIO_HW_QSGMII_CFG_E_DET_ENA                      BIT(2)
+#define HSIO_HW_QSGMII_CFG_USE_I1_ENA                     BIT(1)
+#define HSIO_HW_QSGMII_CFG_FLIP_LANES                     BIT(0)
+
+#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS(x)           (((x) << 1) & GENMASK(6, 1))
+#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS_M            GENMASK(6, 1)
+#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS_X(x)         (((x) & GENMASK(6, 1)) >> 1)
+#define HSIO_HW_QSGMII_STAT_SYNC                          BIT(0)
+
+#define HSIO_CLK_CFG_CLKDIV_PHY(x)                        (((x) << 1) & GENMASK(8, 1))
+#define HSIO_CLK_CFG_CLKDIV_PHY_M                         GENMASK(8, 1)
+#define HSIO_CLK_CFG_CLKDIV_PHY_X(x)                      (((x) & GENMASK(8, 1)) >> 1)
+#define HSIO_CLK_CFG_CLKDIV_PHY_DIS                       BIT(0)
+
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_TEMP_RD               BIT(5)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_RUN                   BIT(4)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_NO_RST                BIT(3)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_POWER_UP              BIT(2)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_CLK                   BIT(1)
+#define HSIO_TEMP_SENSOR_CTRL_SAMPLE_ENA                  BIT(0)
+
+#define HSIO_TEMP_SENSOR_CFG_RUN_WID(x)                   (((x) << 8) & GENMASK(15, 8))
+#define HSIO_TEMP_SENSOR_CFG_RUN_WID_M                    GENMASK(15, 8)
+#define HSIO_TEMP_SENSOR_CFG_RUN_WID_X(x)                 (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_TEMP_SENSOR_CFG_SAMPLE_PER(x)                ((x) & GENMASK(7, 0))
+#define HSIO_TEMP_SENSOR_CFG_SAMPLE_PER_M                 GENMASK(7, 0)
+
+#define HSIO_TEMP_SENSOR_STAT_TEMP_VALID                  BIT(8)
+#define HSIO_TEMP_SENSOR_STAT_TEMP(x)                     ((x) & GENMASK(7, 0))
+#define HSIO_TEMP_SENSOR_STAT_TEMP_M                      GENMASK(7, 0)
+
+#endif
-- 
cgit v1.2.3


From 66c213233308ac94fabed80b09041eb14f8d787b Mon Sep 17 00:00:00 2001
From: Quentin Schulz <quentin.schulz@bootlin.com>
Date: Thu, 4 Oct 2018 14:22:02 +0200
Subject: net: mscc: ocelot: simplify register access for PLL5 configuration

Since HSIO address space can be accessed by different drivers, let's
simplify the register address definitions so that it can be easily used
by all drivers and put the register address definition in the
include/soc/mscc/ocelot_hsio.h header file.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.h      | 73 --------------------------
 drivers/net/ethernet/mscc/ocelot_regs.c | 92 ++++-----------------------------
 include/soc/mscc/ocelot_hsio.h          | 74 ++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 156 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index d3980158c4a3..ff0e3a5d7487 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -333,79 +333,6 @@ enum ocelot_reg {
 	SYS_CM_DATA_RD,
 	SYS_CM_OP,
 	SYS_CM_DATA,
-	HSIO_PLL5G_CFG0 = HSIO << TARGET_OFFSET,
-	HSIO_PLL5G_CFG1,
-	HSIO_PLL5G_CFG2,
-	HSIO_PLL5G_CFG3,
-	HSIO_PLL5G_CFG4,
-	HSIO_PLL5G_CFG5,
-	HSIO_PLL5G_CFG6,
-	HSIO_PLL5G_STATUS0,
-	HSIO_PLL5G_STATUS1,
-	HSIO_PLL5G_BIST_CFG0,
-	HSIO_PLL5G_BIST_CFG1,
-	HSIO_PLL5G_BIST_CFG2,
-	HSIO_PLL5G_BIST_STAT0,
-	HSIO_PLL5G_BIST_STAT1,
-	HSIO_RCOMP_CFG0,
-	HSIO_RCOMP_STATUS,
-	HSIO_SYNC_ETH_CFG,
-	HSIO_SYNC_ETH_PLL_CFG,
-	HSIO_S1G_DES_CFG,
-	HSIO_S1G_IB_CFG,
-	HSIO_S1G_OB_CFG,
-	HSIO_S1G_SER_CFG,
-	HSIO_S1G_COMMON_CFG,
-	HSIO_S1G_PLL_CFG,
-	HSIO_S1G_PLL_STATUS,
-	HSIO_S1G_DFT_CFG0,
-	HSIO_S1G_DFT_CFG1,
-	HSIO_S1G_DFT_CFG2,
-	HSIO_S1G_TP_CFG,
-	HSIO_S1G_RC_PLL_BIST_CFG,
-	HSIO_S1G_MISC_CFG,
-	HSIO_S1G_DFT_STATUS,
-	HSIO_S1G_MISC_STATUS,
-	HSIO_MCB_S1G_ADDR_CFG,
-	HSIO_S6G_DIG_CFG,
-	HSIO_S6G_DFT_CFG0,
-	HSIO_S6G_DFT_CFG1,
-	HSIO_S6G_DFT_CFG2,
-	HSIO_S6G_TP_CFG0,
-	HSIO_S6G_TP_CFG1,
-	HSIO_S6G_RC_PLL_BIST_CFG,
-	HSIO_S6G_MISC_CFG,
-	HSIO_S6G_OB_ANEG_CFG,
-	HSIO_S6G_DFT_STATUS,
-	HSIO_S6G_ERR_CNT,
-	HSIO_S6G_MISC_STATUS,
-	HSIO_S6G_DES_CFG,
-	HSIO_S6G_IB_CFG,
-	HSIO_S6G_IB_CFG1,
-	HSIO_S6G_IB_CFG2,
-	HSIO_S6G_IB_CFG3,
-	HSIO_S6G_IB_CFG4,
-	HSIO_S6G_IB_CFG5,
-	HSIO_S6G_OB_CFG,
-	HSIO_S6G_OB_CFG1,
-	HSIO_S6G_SER_CFG,
-	HSIO_S6G_COMMON_CFG,
-	HSIO_S6G_PLL_CFG,
-	HSIO_S6G_ACJTAG_CFG,
-	HSIO_S6G_GP_CFG,
-	HSIO_S6G_IB_STATUS0,
-	HSIO_S6G_IB_STATUS1,
-	HSIO_S6G_ACJTAG_STATUS,
-	HSIO_S6G_PLL_STATUS,
-	HSIO_S6G_REVID,
-	HSIO_MCB_S6G_ADDR_CFG,
-	HSIO_HW_CFG,
-	HSIO_HW_QSGMII_CFG,
-	HSIO_HW_QSGMII_STAT,
-	HSIO_CLK_CFG,
-	HSIO_TEMP_SENSOR_CTRL,
-	HSIO_TEMP_SENSOR_CFG,
-	HSIO_TEMP_SENSOR_STAT,
 };
 
 enum ocelot_regfield {
diff --git a/drivers/net/ethernet/mscc/ocelot_regs.c b/drivers/net/ethernet/mscc/ocelot_regs.c
index e334b406c40c..2518ce0fe265 100644
--- a/drivers/net/ethernet/mscc/ocelot_regs.c
+++ b/drivers/net/ethernet/mscc/ocelot_regs.c
@@ -102,82 +102,6 @@ static const u32 ocelot_qs_regmap[] = {
 	REG(QS_INH_DBG,                    0x000048),
 };
 
-static const u32 ocelot_hsio_regmap[] = {
-	REG(HSIO_PLL5G_CFG0,               0x000000),
-	REG(HSIO_PLL5G_CFG1,               0x000004),
-	REG(HSIO_PLL5G_CFG2,               0x000008),
-	REG(HSIO_PLL5G_CFG3,               0x00000c),
-	REG(HSIO_PLL5G_CFG4,               0x000010),
-	REG(HSIO_PLL5G_CFG5,               0x000014),
-	REG(HSIO_PLL5G_CFG6,               0x000018),
-	REG(HSIO_PLL5G_STATUS0,            0x00001c),
-	REG(HSIO_PLL5G_STATUS1,            0x000020),
-	REG(HSIO_PLL5G_BIST_CFG0,          0x000024),
-	REG(HSIO_PLL5G_BIST_CFG1,          0x000028),
-	REG(HSIO_PLL5G_BIST_CFG2,          0x00002c),
-	REG(HSIO_PLL5G_BIST_STAT0,         0x000030),
-	REG(HSIO_PLL5G_BIST_STAT1,         0x000034),
-	REG(HSIO_RCOMP_CFG0,               0x000038),
-	REG(HSIO_RCOMP_STATUS,             0x00003c),
-	REG(HSIO_SYNC_ETH_CFG,             0x000040),
-	REG(HSIO_SYNC_ETH_PLL_CFG,         0x000048),
-	REG(HSIO_S1G_DES_CFG,              0x00004c),
-	REG(HSIO_S1G_IB_CFG,               0x000050),
-	REG(HSIO_S1G_OB_CFG,               0x000054),
-	REG(HSIO_S1G_SER_CFG,              0x000058),
-	REG(HSIO_S1G_COMMON_CFG,           0x00005c),
-	REG(HSIO_S1G_PLL_CFG,              0x000060),
-	REG(HSIO_S1G_PLL_STATUS,           0x000064),
-	REG(HSIO_S1G_DFT_CFG0,             0x000068),
-	REG(HSIO_S1G_DFT_CFG1,             0x00006c),
-	REG(HSIO_S1G_DFT_CFG2,             0x000070),
-	REG(HSIO_S1G_TP_CFG,               0x000074),
-	REG(HSIO_S1G_RC_PLL_BIST_CFG,      0x000078),
-	REG(HSIO_S1G_MISC_CFG,             0x00007c),
-	REG(HSIO_S1G_DFT_STATUS,           0x000080),
-	REG(HSIO_S1G_MISC_STATUS,          0x000084),
-	REG(HSIO_MCB_S1G_ADDR_CFG,         0x000088),
-	REG(HSIO_S6G_DIG_CFG,              0x00008c),
-	REG(HSIO_S6G_DFT_CFG0,             0x000090),
-	REG(HSIO_S6G_DFT_CFG1,             0x000094),
-	REG(HSIO_S6G_DFT_CFG2,             0x000098),
-	REG(HSIO_S6G_TP_CFG0,              0x00009c),
-	REG(HSIO_S6G_TP_CFG1,              0x0000a0),
-	REG(HSIO_S6G_RC_PLL_BIST_CFG,      0x0000a4),
-	REG(HSIO_S6G_MISC_CFG,             0x0000a8),
-	REG(HSIO_S6G_OB_ANEG_CFG,          0x0000ac),
-	REG(HSIO_S6G_DFT_STATUS,           0x0000b0),
-	REG(HSIO_S6G_ERR_CNT,              0x0000b4),
-	REG(HSIO_S6G_MISC_STATUS,          0x0000b8),
-	REG(HSIO_S6G_DES_CFG,              0x0000bc),
-	REG(HSIO_S6G_IB_CFG,               0x0000c0),
-	REG(HSIO_S6G_IB_CFG1,              0x0000c4),
-	REG(HSIO_S6G_IB_CFG2,              0x0000c8),
-	REG(HSIO_S6G_IB_CFG3,              0x0000cc),
-	REG(HSIO_S6G_IB_CFG4,              0x0000d0),
-	REG(HSIO_S6G_IB_CFG5,              0x0000d4),
-	REG(HSIO_S6G_OB_CFG,               0x0000d8),
-	REG(HSIO_S6G_OB_CFG1,              0x0000dc),
-	REG(HSIO_S6G_SER_CFG,              0x0000e0),
-	REG(HSIO_S6G_COMMON_CFG,           0x0000e4),
-	REG(HSIO_S6G_PLL_CFG,              0x0000e8),
-	REG(HSIO_S6G_ACJTAG_CFG,           0x0000ec),
-	REG(HSIO_S6G_GP_CFG,               0x0000f0),
-	REG(HSIO_S6G_IB_STATUS0,           0x0000f4),
-	REG(HSIO_S6G_IB_STATUS1,           0x0000f8),
-	REG(HSIO_S6G_ACJTAG_STATUS,        0x0000fc),
-	REG(HSIO_S6G_PLL_STATUS,           0x000100),
-	REG(HSIO_S6G_REVID,                0x000104),
-	REG(HSIO_MCB_S6G_ADDR_CFG,         0x000108),
-	REG(HSIO_HW_CFG,                   0x00010c),
-	REG(HSIO_HW_QSGMII_CFG,            0x000110),
-	REG(HSIO_HW_QSGMII_STAT,           0x000114),
-	REG(HSIO_CLK_CFG,                  0x000118),
-	REG(HSIO_TEMP_SENSOR_CTRL,         0x00011c),
-	REG(HSIO_TEMP_SENSOR_CFG,          0x000120),
-	REG(HSIO_TEMP_SENSOR_STAT,         0x000124),
-};
-
 static const u32 ocelot_qsys_regmap[] = {
 	REG(QSYS_PORT_MODE,                0x011200),
 	REG(QSYS_SWITCH_PORT_MODE,         0x011234),
@@ -302,7 +226,6 @@ static const u32 ocelot_sys_regmap[] = {
 static const u32 *ocelot_regmap[] = {
 	[ANA] = ocelot_ana_regmap,
 	[QS] = ocelot_qs_regmap,
-	[HSIO] = ocelot_hsio_regmap,
 	[QSYS] = ocelot_qsys_regmap,
 	[REW] = ocelot_rew_regmap,
 	[SYS] = ocelot_sys_regmap,
@@ -453,9 +376,11 @@ static void ocelot_pll5_init(struct ocelot *ocelot)
 	/* Configure PLL5. This will need a proper CCF driver
 	 * The values are coming from the VTSS API for Ocelot
 	 */
-	ocelot_write(ocelot, HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
-		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8), HSIO_PLL5G_CFG4);
-	ocelot_write(ocelot, HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG4,
+		     HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
+		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8));
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG0,
+		     HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
 		     HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
 		     HSIO_PLL5G_CFG0_ENA_BIAS |
 		     HSIO_PLL5G_CFG0_ENA_VCO_BUF |
@@ -465,13 +390,14 @@ static void ocelot_pll5_init(struct ocelot *ocelot)
 		     HSIO_PLL5G_CFG0_SELBGV820(4) |
 		     HSIO_PLL5G_CFG0_DIV4 |
 		     HSIO_PLL5G_CFG0_ENA_CLKTREE |
-		     HSIO_PLL5G_CFG0_ENA_LANE, HSIO_PLL5G_CFG0);
-	ocelot_write(ocelot, HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
+		     HSIO_PLL5G_CFG0_ENA_LANE);
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG2,
+		     HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
 		     HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
 		     HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
 		     HSIO_PLL5G_CFG2_ENA_AMPCTRL |
 		     HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
-		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10), HSIO_PLL5G_CFG2);
+		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10));
 }
 
 int ocelot_chip_init(struct ocelot *ocelot)
diff --git a/include/soc/mscc/ocelot_hsio.h b/include/soc/mscc/ocelot_hsio.h
index d93ddec3931b..43112dd7313a 100644
--- a/include/soc/mscc/ocelot_hsio.h
+++ b/include/soc/mscc/ocelot_hsio.h
@@ -8,6 +8,80 @@
 #ifndef _MSCC_OCELOT_HSIO_H_
 #define _MSCC_OCELOT_HSIO_H_
 
+#define HSIO_PLL5G_CFG0			0x0000
+#define HSIO_PLL5G_CFG1			0x0004
+#define HSIO_PLL5G_CFG2			0x0008
+#define HSIO_PLL5G_CFG3			0x000c
+#define HSIO_PLL5G_CFG4			0x0010
+#define HSIO_PLL5G_CFG5			0x0014
+#define HSIO_PLL5G_CFG6			0x0018
+#define HSIO_PLL5G_STATUS0		0x001c
+#define HSIO_PLL5G_STATUS1		0x0020
+#define HSIO_PLL5G_BIST_CFG0		0x0024
+#define HSIO_PLL5G_BIST_CFG1		0x0028
+#define HSIO_PLL5G_BIST_CFG2		0x002c
+#define HSIO_PLL5G_BIST_STAT0		0x0030
+#define HSIO_PLL5G_BIST_STAT1		0x0034
+#define HSIO_RCOMP_CFG0			0x0038
+#define HSIO_RCOMP_STATUS		0x003c
+#define HSIO_SYNC_ETH_CFG		0x0040
+#define HSIO_SYNC_ETH_PLL_CFG		0x0048
+#define HSIO_S1G_DES_CFG		0x004c
+#define HSIO_S1G_IB_CFG			0x0050
+#define HSIO_S1G_OB_CFG			0x0054
+#define HSIO_S1G_SER_CFG		0x0058
+#define HSIO_S1G_COMMON_CFG		0x005c
+#define HSIO_S1G_PLL_CFG		0x0060
+#define HSIO_S1G_PLL_STATUS		0x0064
+#define HSIO_S1G_DFT_CFG0		0x0068
+#define HSIO_S1G_DFT_CFG1		0x006c
+#define HSIO_S1G_DFT_CFG2		0x0070
+#define HSIO_S1G_TP_CFG			0x0074
+#define HSIO_S1G_RC_PLL_BIST_CFG	0x0078
+#define HSIO_S1G_MISC_CFG		0x007c
+#define HSIO_S1G_DFT_STATUS		0x0080
+#define HSIO_S1G_MISC_STATUS		0x0084
+#define HSIO_MCB_S1G_ADDR_CFG		0x0088
+#define HSIO_S6G_DIG_CFG		0x008c
+#define HSIO_S6G_DFT_CFG0		0x0090
+#define HSIO_S6G_DFT_CFG1		0x0094
+#define HSIO_S6G_DFT_CFG2		0x0098
+#define HSIO_S6G_TP_CFG0		0x009c
+#define HSIO_S6G_TP_CFG1		0x00a0
+#define HSIO_S6G_RC_PLL_BIST_CFG	0x00a4
+#define HSIO_S6G_MISC_CFG		0x00a8
+#define HSIO_S6G_OB_ANEG_CFG		0x00ac
+#define HSIO_S6G_DFT_STATUS		0x00b0
+#define HSIO_S6G_ERR_CNT		0x00b4
+#define HSIO_S6G_MISC_STATUS		0x00b8
+#define HSIO_S6G_DES_CFG		0x00bc
+#define HSIO_S6G_IB_CFG			0x00c0
+#define HSIO_S6G_IB_CFG1		0x00c4
+#define HSIO_S6G_IB_CFG2		0x00c8
+#define HSIO_S6G_IB_CFG3		0x00cc
+#define HSIO_S6G_IB_CFG4		0x00d0
+#define HSIO_S6G_IB_CFG5		0x00d4
+#define HSIO_S6G_OB_CFG			0x00d8
+#define HSIO_S6G_OB_CFG1		0x00dc
+#define HSIO_S6G_SER_CFG		0x00e0
+#define HSIO_S6G_COMMON_CFG		0x00e4
+#define HSIO_S6G_PLL_CFG		0x00e8
+#define HSIO_S6G_ACJTAG_CFG		0x00ec
+#define HSIO_S6G_GP_CFG			0x00f0
+#define HSIO_S6G_IB_STATUS0		0x00f4
+#define HSIO_S6G_IB_STATUS1		0x00f8
+#define HSIO_S6G_ACJTAG_STATUS		0x00fc
+#define HSIO_S6G_PLL_STATUS		0x0100
+#define HSIO_S6G_REVID			0x0104
+#define HSIO_MCB_S6G_ADDR_CFG		0x0108
+#define HSIO_HW_CFG			0x010c
+#define HSIO_HW_QSGMII_CFG		0x0110
+#define HSIO_HW_QSGMII_STAT		0x0114
+#define HSIO_CLK_CFG			0x0118
+#define HSIO_TEMP_SENSOR_CTRL		0x011c
+#define HSIO_TEMP_SENSOR_CFG		0x0120
+#define HSIO_TEMP_SENSOR_STAT		0x0124
+
 #define HSIO_PLL5G_CFG0_ENA_ROT                           BIT(31)
 #define HSIO_PLL5G_CFG0_ENA_LANE                          BIT(30)
 #define HSIO_PLL5G_CFG0_ENA_CLKTREE                       BIT(29)
-- 
cgit v1.2.3


From c2a90025ad09d830c8d8ae69f485eac6aaaa2472 Mon Sep 17 00:00:00 2001
From: Quentin Schulz <quentin.schulz@bootlin.com>
Date: Thu, 4 Oct 2018 14:22:03 +0200
Subject: phy: add QSGMII and PCIE modes

Prepare for upcoming phys that'll handle QSGMII or PCIe.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy/phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index 9713aebdd348..03b319f89a34 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -37,9 +37,11 @@ enum phy_mode {
 	PHY_MODE_USB_OTG,
 	PHY_MODE_SGMII,
 	PHY_MODE_2500SGMII,
+	PHY_MODE_QSGMII,
 	PHY_MODE_10GKR,
 	PHY_MODE_UFS_HS_A,
 	PHY_MODE_UFS_HS_B,
+	PHY_MODE_PCIE,
 };
 
 /**
-- 
cgit v1.2.3


From b68fc09be48edbc47de1a0f3d42ef8adf6c0ac55 Mon Sep 17 00:00:00 2001
From: Quentin Schulz <quentin.schulz@bootlin.com>
Date: Thu, 4 Oct 2018 14:22:06 +0200
Subject: dt-bindings: add constants for Microsemi Ocelot SerDes driver

The Microsemi Ocelot has multiple SerDes and requires that the SerDes be
muxed accordingly to the hardware representation.

Let's add a constant for each SerDes available in the Microsemi Ocelot.

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/phy/phy-ocelot-serdes.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 include/dt-bindings/phy/phy-ocelot-serdes.h

(limited to 'include')

diff --git a/include/dt-bindings/phy/phy-ocelot-serdes.h b/include/dt-bindings/phy/phy-ocelot-serdes.h
new file mode 100644
index 000000000000..bd28f21206f6
--- /dev/null
+++ b/include/dt-bindings/phy/phy-ocelot-serdes.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/* Copyright (c) 2018 Microsemi Corporation */
+#ifndef __PHY_OCELOT_SERDES_H__
+#define __PHY_OCELOT_SERDES_H__
+
+#define SERDES1G(x)	(x)
+#define SERDES1G_MAX	SERDES1G(5)
+#define SERDES6G(x)	(SERDES1G_MAX + 1 + (x))
+#define SERDES6G_MAX	SERDES6G(2)
+#define SERDES_MAX	SERDES6G_MAX
+
+#endif
-- 
cgit v1.2.3


From c941ce9c282cc606e6517356fcc186a9da2b4ab9 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:47 +0100
Subject: bpf: add verifier callback to get stack usage info for offloaded
 progs

In preparation for BPF-to-BPF calls in offloaded programs, add a new
function attribute to the struct bpf_prog_offload_ops so that drivers
supporting eBPF offload can hook at the end of program verification, and
potentially extract information collected by the verifier.

Implement a minimal callback (returning 0) in the drivers providing the
structs, namely netdevsim and nfp.

This will be useful in the nfp driver, in later commits, to extract the
number of subprograms as well as the stack depth for those subprograms.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |  8 +++++++-
 drivers/net/netdevsim/bpf.c                       |  8 +++++++-
 include/linux/bpf.h                               |  1 +
 include/linux/bpf_verifier.h                      |  1 +
 kernel/bpf/offload.c                              | 18 ++++++++++++++++++
 kernel/bpf/verifier.c                             |  3 +++
 6 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index a6e9248669e1..e470489021e3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -640,6 +640,12 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	return 0;
 }
 
+static int nfp_bpf_finalize(struct bpf_verifier_env *env)
+{
+	return 0;
+}
+
 const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = {
-	.insn_hook = nfp_verify_insn,
+	.insn_hook	= nfp_verify_insn,
+	.finalize	= nfp_bpf_finalize,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 81444208b216..cb3518474f0e 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -86,8 +86,14 @@ nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn)
 	return 0;
 }
 
+static int nsim_bpf_finalize(struct bpf_verifier_env *env)
+{
+	return 0;
+}
+
 static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = {
-	.insn_hook = nsim_bpf_verify_insn,
+	.insn_hook	= nsim_bpf_verify_insn,
+	.finalize	= nsim_bpf_finalize,
 };
 
 static bool nsim_xdp_offload_active(struct netdevsim *ns)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f..9b558713447f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -263,6 +263,7 @@ struct bpf_verifier_ops {
 struct bpf_prog_offload_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
+	int (*finalize)(struct bpf_verifier_env *env);
 };
 
 struct bpf_prog_offload {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7b6fd2ab3263..9e8056ec20fa 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -245,5 +245,6 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 				 int insn_idx, int prev_insn_idx);
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env);
 
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 177a52436394..8e93c47f0779 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 	return ret;
 }
 
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
+{
+	struct bpf_prog_offload *offload;
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload) {
+		if (offload->dev_ops->finalize)
+			ret = offload->dev_ops->finalize(env);
+		else
+			ret = 0;
+	}
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_prog_offload *offload = prog->aux->offload;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 73c81bef6ae8..a0454cb299ba 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6309,6 +6309,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		env->cur_state = NULL;
 	}
 
+	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+		ret = bpf_prog_offload_finalize(env);
+
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
-- 
cgit v1.2.3


From 9494a6c2e4f6ce21a1e6885145171f90c4492131 Mon Sep 17 00:00:00 2001
From: Quentin Schulz <quentin.schulz@bootlin.com>
Date: Mon, 8 Oct 2018 12:14:41 +0200
Subject: dt-bindings: net: vsc8531: add two additional LED modes for VSC8584

The VSC8584 (and most likely other PHYs in the same generation) has two
additional LED modes that can be picked, so let's add them.

Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/net/mscc-phy-vsc8531.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/net/mscc-phy-vsc8531.h b/include/dt-bindings/net/mscc-phy-vsc8531.h
index 697161f80eb5..9eb2ec2b2ea9 100644
--- a/include/dt-bindings/net/mscc-phy-vsc8531.h
+++ b/include/dt-bindings/net/mscc-phy-vsc8531.h
@@ -18,9 +18,11 @@
 #define VSC8531_LINK_100_1000_ACTIVITY  4
 #define VSC8531_LINK_10_1000_ACTIVITY   5
 #define VSC8531_LINK_10_100_ACTIVITY    6
+#define VSC8584_LINK_100FX_1000X_ACTIVITY	7
 #define VSC8531_DUPLEX_COLLISION        8
 #define VSC8531_COLLISION               9
 #define VSC8531_ACTIVITY                10
+#define VSC8584_100FX_1000X_ACTIVITY	11
 #define VSC8531_AUTONEG_FAULT           12
 #define VSC8531_SERIAL_MODE             13
 #define VSC8531_FORCE_LED_OFF           14
-- 
cgit v1.2.3


From 4a19edb60d0203cd5bf95a8b46ea8f63fd41194c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 7 Oct 2018 20:16:22 -0700
Subject: netlink: Pass extack to dump handlers

Declare extack in netlink_dump and pass to dump handlers via
netlink_callback. Add any extack message after the dump_done_errno
allowing error messages to be returned. This will be useful when
strict checking is done on dump requests, returning why the dump
fails EINVAL.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Christian Brauner <christian@brauner.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  |  1 +
 net/netlink/af_netlink.c | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 71f121b66ca8..88c8a2d83eb3 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -176,6 +176,7 @@ struct netlink_callback {
 	void			*data;
 	/* the module that dump function belong to */
 	struct module		*module;
+	struct netlink_ext_ack	*extack;
 	u16			family;
 	u16			min_dump_alloc;
 	unsigned int		prev_seq, seq;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e3a0538ec0be..7ac585f33a9e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2171,6 +2171,7 @@ EXPORT_SYMBOL(__nlmsg_put);
 static int netlink_dump(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_ext_ack extack = {};
 	struct netlink_callback *cb;
 	struct sk_buff *skb = NULL;
 	struct nlmsghdr *nlh;
@@ -2222,8 +2223,11 @@ static int netlink_dump(struct sock *sk)
 	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
 	netlink_skb_set_owner_r(skb, sk);
 
-	if (nlk->dump_done_errno > 0)
+	if (nlk->dump_done_errno > 0) {
+		cb->extack = &extack;
 		nlk->dump_done_errno = cb->dump(skb, cb);
+		cb->extack = NULL;
+	}
 
 	if (nlk->dump_done_errno > 0 ||
 	    skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
@@ -2246,6 +2250,12 @@ static int netlink_dump(struct sock *sk)
 	memcpy(nlmsg_data(nlh), &nlk->dump_done_errno,
 	       sizeof(nlk->dump_done_errno));
 
+	if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) {
+		nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
+		if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg))
+			nlmsg_end(skb, nlh);
+	}
+
 	if (sk_filter(sk, skb))
 		kfree_skb(skb);
 	else
-- 
cgit v1.2.3


From 3d0d4337d7a105c5e8ba85c6e7b75437b4c6745e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 7 Oct 2018 20:16:23 -0700
Subject: netlink: Add extack message to nlmsg_parse for invalid header length

Give a user a reason why EINVAL is returned in nlmsg_parse.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Christian Brauner <christian@brauner.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 589683091f16..9522a0bf1f3a 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -516,8 +516,10 @@ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
 			      const struct nla_policy *policy,
 			      struct netlink_ext_ack *extack)
 {
-	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
+		NL_SET_ERR_MSG(extack, "Invalid header length");
 		return -EINVAL;
+	}
 
 	return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
 			 nlmsg_attrlen(nlh, hdrlen), policy, extack);
-- 
cgit v1.2.3


From a5f6cba291654168e6ab73c3e7ff5b27371c4cb9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 7 Oct 2018 20:16:25 -0700
Subject: netlink: Add strict version of nlmsg_parse and nla_parse

nla_parse is currently lenient on message parsing, allowing type to be 0
or greater than max expected and only logging a message

    "netlink: %d bytes leftover after parsing attributes in process `%s'."

if the netlink message has unknown data at the end after parsing. What this
could mean is that the header at the front of the attributes is actually
wrong and the parsing is shifted from what is expected.

Add a new strict version that actually fails with EINVAL if there are any
bytes remaining after the parsing loop completes, if the atttrbitue type
is 0 or greater than max expected.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Christian Brauner <christian@brauner.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 17 +++++++++++++++++
 lib/nlattr.c          | 48 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 53 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 9522a0bf1f3a..f1db8e594847 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -373,6 +373,9 @@ int nla_validate(const struct nlattr *head, int len, int maxtype,
 int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
 	      int len, const struct nla_policy *policy,
 	      struct netlink_ext_ack *extack);
+int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head,
+		     int len, const struct nla_policy *policy,
+		     struct netlink_ext_ack *extack);
 int nla_policy_len(const struct nla_policy *, int);
 struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
 size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize);
@@ -525,6 +528,20 @@ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
 			 nlmsg_attrlen(nlh, hdrlen), policy, extack);
 }
 
+static inline int nlmsg_parse_strict(const struct nlmsghdr *nlh, int hdrlen,
+				     struct nlattr *tb[], int maxtype,
+				     const struct nla_policy *policy,
+				     struct netlink_ext_ack *extack)
+{
+	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
+		NL_SET_ERR_MSG(extack, "Invalid header length");
+		return -EINVAL;
+	}
+
+	return nla_parse_strict(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
+				nlmsg_attrlen(nlh, hdrlen), policy, extack);
+}
+
 /**
  * nlmsg_find_attr - find a specific attribute in a netlink message
  * @nlh: netlink message header
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 1e900bb414ef..d26de6156b97 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -391,9 +391,10 @@ EXPORT_SYMBOL(nla_policy_len);
  *
  * Returns 0 on success or a negative error code.
  */
-int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
-	      int len, const struct nla_policy *policy,
-	      struct netlink_ext_ack *extack)
+static int __nla_parse(struct nlattr **tb, int maxtype,
+		       const struct nlattr *head, int len,
+		       bool strict, const struct nla_policy *policy,
+		       struct netlink_ext_ack *extack)
 {
 	const struct nlattr *nla;
 	int rem;
@@ -403,27 +404,50 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
 	nla_for_each_attr(nla, head, len, rem) {
 		u16 type = nla_type(nla);
 
-		if (type > 0 && type <= maxtype) {
-			if (policy) {
-				int err = validate_nla(nla, maxtype, policy,
-						       extack);
-
-				if (err < 0)
-					return err;
+		if (type == 0 || type > maxtype) {
+			if (strict) {
+				NL_SET_ERR_MSG(extack, "Unknown attribute type");
+				return -EINVAL;
 			}
+			continue;
+		}
+		if (policy) {
+			int err = validate_nla(nla, maxtype, policy, extack);
 
-			tb[type] = (struct nlattr *)nla;
+			if (err < 0)
+				return err;
 		}
+
+		tb[type] = (struct nlattr *)nla;
 	}
 
-	if (unlikely(rem > 0))
+	if (unlikely(rem > 0)) {
 		pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
 				    rem, current->comm);
+		NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes");
+		if (strict)
+			return -EINVAL;
+	}
 
 	return 0;
 }
+
+int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
+	      int len, const struct nla_policy *policy,
+	      struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, false, policy, extack);
+}
 EXPORT_SYMBOL(nla_parse);
 
+int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head,
+		     int len, const struct nla_policy *policy,
+		     struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, true, policy, extack);
+}
+EXPORT_SYMBOL(nla_parse_strict);
+
 /**
  * nla_find - Find a specific attribute in a stream of attributes
  * @head: head of attribute stream
-- 
cgit v1.2.3


From 89d35528d17d25819a755a2b52931e911baebc66 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 7 Oct 2018 20:16:27 -0700
Subject: netlink: Add new socket option to enable strict checking on dumps

Add a new socket option, NETLINK_DUMP_STRICT_CHK, that userspace
can use via setsockopt to request strict checking of headers and
attributes on dump requests.

To get dump features such as kernel side filtering based on data in
the header or attributes appended to the dump request, userspace
must call setsockopt() for NETLINK_DUMP_STRICT_CHK and a non-zero
value. Since the netlink sock and its flags are private to the
af_netlink code, the strict checking flag is passed to dump handlers
via a flag in the netlink_callback struct.

For old userspace on new kernel there is no impact as all of the data
checks in later patches are wrapped in a check on the new strict flag.

For new userspace on old kernel, the setsockopt will fail and even if
new userspace sets data in the headers and appended attributes the
kernel will silently ignore it. Moving forward when the setsockopt
succeeds, the new userspace on old kernel means the dump request can
pass an attribute the kernel does not understand. The dump will then
fail as the older kernel does not understand it.

New userspace on new kernel setting the socket option gets the benefit
of the improved data dump.

Kernel side the NETLINK_DUMP_STRICT_CHK uapi is converted to a generic
NETLINK_F_STRICT_CHK flag which can potentially be leveraged for tighter
checking on the NEW, DEL, and SET commands.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Christian Brauner <christian@brauner.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h      |  1 +
 include/uapi/linux/netlink.h |  1 +
 net/netlink/af_netlink.c     | 21 ++++++++++++++++++++-
 net/netlink/af_netlink.h     |  1 +
 4 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 88c8a2d83eb3..72580f1a72a2 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -179,6 +179,7 @@ struct netlink_callback {
 	struct netlink_ext_ack	*extack;
 	u16			family;
 	u16			min_dump_alloc;
+	bool			strict_check;
 	unsigned int		prev_seq, seq;
 	long			args[6];
 };
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 776bc92e9118..486ed1f0c0bc 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -155,6 +155,7 @@ enum nlmsgerr_attrs {
 #define NETLINK_LIST_MEMBERSHIPS	9
 #define NETLINK_CAP_ACK			10
 #define NETLINK_EXT_ACK			11
+#define NETLINK_DUMP_STRICT_CHK		12
 
 struct nl_pktinfo {
 	__u32	group;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 7ac585f33a9e..e613a9f89600 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1706,6 +1706,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 			nlk->flags &= ~NETLINK_F_EXT_ACK;
 		err = 0;
 		break;
+	case NETLINK_DUMP_STRICT_CHK:
+		if (val)
+			nlk->flags |= NETLINK_F_STRICT_CHK;
+		else
+			nlk->flags &= ~NETLINK_F_STRICT_CHK;
+		err = 0;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -1799,6 +1806,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 		err = 0;
 		break;
+	case NETLINK_DUMP_STRICT_CHK:
+		if (len < sizeof(int))
+			return -EINVAL;
+		len = sizeof(int);
+		val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0;
+		if (put_user(len, optlen) || put_user(val, optval))
+			return -EFAULT;
+		err = 0;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -2282,9 +2298,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 			 const struct nlmsghdr *nlh,
 			 struct netlink_dump_control *control)
 {
+	struct netlink_sock *nlk, *nlk2;
 	struct netlink_callback *cb;
 	struct sock *sk;
-	struct netlink_sock *nlk;
 	int ret;
 
 	refcount_inc(&skb->users);
@@ -2318,6 +2334,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->min_dump_alloc = control->min_dump_alloc;
 	cb->skb = skb;
 
+	nlk2 = nlk_sk(NETLINK_CB(skb).sk);
+	cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK);
+
 	if (control->start) {
 		ret = control->start(cb);
 		if (ret)
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 962de7b3c023..5f454c8de6a4 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -15,6 +15,7 @@
 #define NETLINK_F_LISTEN_ALL_NSID	0x10
 #define NETLINK_F_CAP_ACK		0x20
 #define NETLINK_F_EXT_ACK		0x40
+#define NETLINK_F_STRICT_CHK		0x80
 
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
-- 
cgit v1.2.3


From e8ba330ac0c55004e775eab53fa1e748e5d71bdb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 7 Oct 2018 20:16:35 -0700
Subject: rtnetlink: Update fib dumps for strict data checking

Add helper to check netlink message for route dumps. If the strict flag
is set the dump request is expected to have an rtmsg struct as the header.
All elements of the struct are expected to be 0 with the exception of
rtm_flags (which is used by both ipv4 and ipv6 dumps) and no attributes
can be appended. rtm_flags can only have RTM_F_CLONED and RTM_F_PREFIX
set.

Update inet_dump_fib, inet6_dump_fib, mpls_dump_routes, ipmr_rtm_dumproute,
and ip6mr_rtm_dumproute to call this helper if strict data checking is
enabled.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  2 ++
 net/ipv4/fib_frontend.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/ipmr.c         |  7 +++++++
 net/ipv6/ip6_fib.c      |  8 ++++++++
 net/ipv6/ip6mr.c        |  9 +++++++++
 net/mpls/af_mpls.c      |  8 ++++++++
 6 files changed, 74 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f7c109e37298..9846b79c9ee1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -452,4 +452,6 @@ static inline void fib_proc_exit(struct net *net)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
+int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 30e2bcc3ef2a..038f511c73fa 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -802,8 +802,40 @@ errout:
 	return err;
 }
 
+int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct rtmsg *rtm;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
+		return -EINVAL;
+	}
+
+	rtm = nlmsg_data(nlh);
+	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
+	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
+	    rtm->rtm_type) {
+		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
+		return -EINVAL;
+	}
+	if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
+		NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
+		return -EINVAL;
+	}
+
+	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
+		NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
+
 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
@@ -811,8 +843,14 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	struct hlist_head *head;
 	int dumped = 0, err;
 
-	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
-	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+	if (cb->strict_check) {
+		err = ip_valid_fib_dump_req(nlh, cb->extack);
+		if (err < 0)
+			return err;
+	}
+
+	if (nlmsg_len(nlh) >= sizeof(struct rtmsg) &&
+	    ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED)
 		return skb->len;
 
 	s_h = cb->args[0];
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e7322e407bb4..91b0d5671649 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,6 +2527,13 @@ errout_free:
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	if (cb->strict_check) {
+		int err = ip_valid_fib_dump_req(cb->nlh, cb->extack);
+
+		if (err < 0)
+			return err;
+	}
+
 	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
 				_ipmr_fill_mroute, &mfc_unres_lock);
 }
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cf709eadc932..e14d244c551f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -564,6 +564,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 
 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
@@ -573,6 +574,13 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	struct hlist_head *head;
 	int res = 0;
 
+	if (cb->strict_check) {
+		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+
+		if (err < 0)
+			return err;
+	}
+
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 6f07b8380425..d7563ef76518 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2457,6 +2457,15 @@ errout:
 
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	const struct nlmsghdr *nlh = cb->nlh;
+
+	if (cb->strict_check) {
+		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+
+		if (err < 0)
+			return err;
+	}
+
 	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
 				_ip6mr_fill_mroute, &mfc_unres_lock);
 }
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 55a30ee3d820..0458c8aa5c11 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2017,6 +2017,7 @@ nla_put_failure:
 
 static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
 	size_t platform_labels;
@@ -2024,6 +2025,13 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 
 	ASSERT_RTNL();
 
+	if (cb->strict_check) {
+		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+
+		if (err < 0)
+			return err;
+	}
+
 	index = cb->args[0];
 	if (index < MPLS_LABEL_FIRST_UNRESERVED)
 		index = MPLS_LABEL_FIRST_UNRESERVED;
-- 
cgit v1.2.3


From f6a8a19bb11b46d60250ddc4e3e1ba6aa166f488 Mon Sep 17 00:00:00 2001
From: Denis Drozdov <denisd@mellanox.com>
Date: Tue, 14 Aug 2018 14:08:51 +0300
Subject: RDMA/netdev: Hoist alloc_netdev_mqs out of the driver

netdev has several interfaces that expect to call alloc_netdev_mqs from
the core code, with the driver only providing the arguments.  This is
incompatible with the rdma_netdev interface that returns the netdev
directly.

Thus re-organize the API used by ipoib so that the verbs core code calls
alloc_netdev_mqs for the driver. This is done by allowing the drivers to
provide the allocation parameters via a 'get_params' callback and then
initializing an allocated netdev as a second step.

Fixes: cd565b4b51e5 ("IB/IPoIB: Support acceleration options callbacks")
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Denis Drozdov <denisd@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/core/verbs.c                    | 32 ++++++++
 drivers/infiniband/hw/mlx5/main.c                  | 23 ++----
 drivers/infiniband/ulp/ipoib/ipoib_main.c          | 21 ++----
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  | 87 ++++++++++++----------
 include/linux/mlx5/driver.h                        | 14 +---
 include/rdma/ib_verbs.h                            | 23 +++++-
 6 files changed, 119 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 6ee03d6089eb..2f34d3412097 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2621,3 +2621,35 @@ void ib_drain_qp(struct ib_qp *qp)
 		ib_drain_rq(qp);
 }
 EXPORT_SYMBOL(ib_drain_qp);
+
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
+				     enum rdma_netdev_t type, const char *name,
+				     unsigned char name_assign_type,
+				     void (*setup)(struct net_device *))
+{
+	struct rdma_netdev_alloc_params params;
+	struct net_device *netdev;
+	int rc;
+
+	if (!device->rdma_netdev_get_params)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	rc = device->rdma_netdev_get_params(device, port_num, type, &params);
+	if (rc)
+		return ERR_PTR(rc);
+
+	netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type,
+				  setup, params.txqs, params.rxqs);
+	if (!netdev)
+		return ERR_PTR(-ENOMEM);
+
+	rc = params.initialize_rdma_netdev(device, port_num, netdev,
+					   params.param);
+	if (rc) {
+		free_netdev(netdev);
+		return ERR_PTR(rc);
+	}
+
+	return netdev;
+}
+EXPORT_SYMBOL(rdma_alloc_netdev);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index c414f3809e5c..5d9b7f62a0ba 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5163,22 +5163,14 @@ done:
 	return num_counters;
 }
 
-static struct net_device*
-mlx5_ib_alloc_rdma_netdev(struct ib_device *hca,
-			  u8 port_num,
-			  enum rdma_netdev_t type,
-			  const char *name,
-			  unsigned char name_assign_type,
-			  void (*setup)(struct net_device *))
+static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
+				 enum rdma_netdev_t type,
+				 struct rdma_netdev_alloc_params *params)
 {
-	struct net_device *netdev;
-
 	if (type != RDMA_NETDEV_IPOIB)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
-	netdev = mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca,
-					name, setup);
-	return netdev;
+	return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
 }
 
 static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@ -5824,8 +5816,9 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
 	dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
 	dev->ib_dev.get_vector_affinity	= mlx5_ib_get_vector_affinity;
-	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads))
-		dev->ib_dev.alloc_rdma_netdev	= mlx5_ib_alloc_rdma_netdev;
+	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
+	    IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
+		dev->ib_dev.rdma_netdev_get_params = mlx5_ib_rn_get_params;
 
 	if (mlx5_core_is_pf(mdev)) {
 		dev->ib_dev.get_vf_config	= mlx5_ib_get_vf_config;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e3d28f9ad9c0..9c816cd41724 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2146,20 +2146,15 @@ static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port,
 {
 	struct net_device *dev;
 
-	if (hca->alloc_rdma_netdev) {
-		dev = hca->alloc_rdma_netdev(hca, port,
-					     RDMA_NETDEV_IPOIB, name,
-					     NET_NAME_UNKNOWN,
-					     ipoib_setup_common);
-		if (IS_ERR_OR_NULL(dev) && PTR_ERR(dev) != -EOPNOTSUPP)
-			return NULL;
-	}
-
-	if (!hca->alloc_rdma_netdev || PTR_ERR(dev) == -EOPNOTSUPP)
-		dev = ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
-						  ipoib_setup_common);
+	dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
+				NET_NAME_UNKNOWN, ipoib_setup_common);
+	if (!IS_ERR(dev))
+		return dev;
+	if (PTR_ERR(dev) != -EOPNOTSUPP)
+		return NULL;
 
-	return dev;
+	return ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
+					   ipoib_setup_common);
 }
 
 struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 299e2a897f7e..af1a95f80404 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -658,53 +658,36 @@ static void mlx5_rdma_netdev_free(struct net_device *netdev)
 	}
 }
 
-struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
-					  struct ib_device *ibdev,
-					  const char *name,
-					  void (*setup)(struct net_device *))
+static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev)
 {
-	const struct mlx5e_profile *profile;
-	struct net_device *netdev;
+	return mdev->mlx5e_res.pdn != 0;
+}
+
+static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
+{
+	if (mlx5_is_sub_interface(mdev))
+		return mlx5i_pkey_get_profile();
+	return &mlx5i_nic_profile;
+}
+
+static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u8 port_num,
+			      struct net_device *netdev, void *param)
+{
+	struct mlx5_core_dev *mdev = (struct mlx5_core_dev *)param;
+	const struct mlx5e_profile *prof = mlx5_get_profile(mdev);
 	struct mlx5i_priv *ipriv;
 	struct mlx5e_priv *epriv;
 	struct rdma_netdev *rn;
-	bool sub_interface;
-	int nch;
 	int err;
 
-	if (mlx5i_check_required_hca_cap(mdev)) {
-		mlx5_core_warn(mdev, "Accelerated mode is not supported\n");
-		return ERR_PTR(-EOPNOTSUPP);
-	}
-
-	/* TODO: Need to find a better way to check if child device*/
-	sub_interface = (mdev->mlx5e_res.pdn != 0);
-
-	if (sub_interface)
-		profile = mlx5i_pkey_get_profile();
-	else
-		profile = &mlx5i_nic_profile;
-
-	nch = profile->max_nch(mdev);
-
-	netdev = alloc_netdev_mqs(sizeof(struct mlx5i_priv) + sizeof(struct mlx5e_priv),
-				  name, NET_NAME_UNKNOWN,
-				  setup,
-				  nch * MLX5E_MAX_NUM_TC,
-				  nch);
-	if (!netdev) {
-		mlx5_core_warn(mdev, "alloc_netdev_mqs failed\n");
-		return NULL;
-	}
-
 	ipriv = netdev_priv(netdev);
 	epriv = mlx5i_epriv(netdev);
 
 	epriv->wq = create_singlethread_workqueue("mlx5i");
 	if (!epriv->wq)
-		goto err_free_netdev;
+		return -ENOMEM;
 
-	ipriv->sub_interface = sub_interface;
+	ipriv->sub_interface = mlx5_is_sub_interface(mdev);
 	if (!ipriv->sub_interface) {
 		err = mlx5i_pkey_qpn_ht_init(netdev);
 		if (err) {
@@ -718,7 +701,7 @@ struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
 			goto destroy_ht;
 	}
 
-	profile->init(mdev, netdev, profile, ipriv);
+	prof->init(mdev, netdev, prof, ipriv);
 
 	mlx5e_attach_netdev(epriv);
 	netif_carrier_off(netdev);
@@ -734,15 +717,37 @@ struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
 	netdev->priv_destructor = mlx5_rdma_netdev_free;
 	netdev->needs_free_netdev = 1;
 
-	return netdev;
+	return 0;
 
 destroy_ht:
 	mlx5i_pkey_qpn_ht_cleanup(netdev);
 destroy_wq:
 	destroy_workqueue(epriv->wq);
-err_free_netdev:
-	free_netdev(netdev);
+	return err;
+}
+
+int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
+			    struct ib_device *device,
+			    struct rdma_netdev_alloc_params *params)
+{
+	int nch;
+	int rc;
+
+	rc = mlx5i_check_required_hca_cap(mdev);
+	if (rc)
+		return rc;
 
-	return NULL;
+	nch = mlx5_get_profile(mdev)->max_nch(mdev);
+
+	*params = (struct rdma_netdev_alloc_params){
+		.sizeof_priv = sizeof(struct mlx5i_priv) +
+			       sizeof(struct mlx5e_priv),
+		.txqs = nch * MLX5E_MAX_NUM_TC,
+		.rxqs = nch,
+		.param = mdev,
+		.initialize_rdma_netdev = mlx5_rdma_setup_rn,
+	};
+
+	return 0;
 }
-EXPORT_SYMBOL(mlx5_rdma_netdev_alloc);
+EXPORT_SYMBOL(mlx5_rdma_rn_get_params);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 26a92462f4ce..4b75796cac23 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1228,21 +1228,15 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
 struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev);
 void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up);
 
-#ifndef CONFIG_MLX5_CORE_IPOIB
-static inline
-struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
-					  struct ib_device *ibdev,
-					  const char *name,
-					  void (*setup)(struct net_device *))
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-#else
+#ifdef CONFIG_MLX5_CORE_IPOIB
 struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
 					  struct ib_device *ibdev,
 					  const char *name,
 					  void (*setup)(struct net_device *));
 #endif /* CONFIG_MLX5_CORE_IPOIB */
+int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
+			    struct ib_device *device,
+			    struct rdma_netdev_alloc_params *params);
 
 struct mlx5_profile {
 	u64	mask;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e950c2a68f06..020216cee8f1 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2223,6 +2223,16 @@ struct rdma_netdev {
 			    union ib_gid *gid, u16 mlid);
 };
 
+struct rdma_netdev_alloc_params {
+	size_t sizeof_priv;
+	unsigned int txqs;
+	unsigned int rxqs;
+	void *param;
+
+	int (*initialize_rdma_netdev)(struct ib_device *device, u8 port_num,
+				      struct net_device *netdev, void *param);
+};
+
 struct ib_port_pkey_list {
 	/* Lock to hold while modifying the list. */
 	spinlock_t                    list_lock;
@@ -2523,8 +2533,8 @@ struct ib_device {
 	/**
 	 * rdma netdev operation
 	 *
-	 * Driver implementing alloc_rdma_netdev must return -EOPNOTSUPP if it
-	 * doesn't support the specified rdma netdev type.
+	 * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params
+	 * must return -EOPNOTSUPP if it doesn't support the specified type.
 	 */
 	struct net_device *(*alloc_rdma_netdev)(
 					struct ib_device *device,
@@ -2534,6 +2544,10 @@ struct ib_device {
 					unsigned char name_assign_type,
 					void (*setup)(struct net_device *));
 
+	int (*rdma_netdev_get_params)(struct ib_device *device, u8 port_num,
+				      enum rdma_netdev_t type,
+				      struct rdma_netdev_alloc_params *params);
+
 	struct module               *owner;
 	struct device                dev;
 	struct kobject               *ports_parent;
@@ -4179,4 +4193,9 @@ struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile);
 
 int uverbs_destroy_def_handler(struct ib_uverbs_file *file,
 			       struct uverbs_attr_bundle *attrs);
+
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
+				     enum rdma_netdev_t type, const char *name,
+				     unsigned char name_assign_type,
+				     void (*setup)(struct net_device *));
 #endif /* IB_VERBS_H */
-- 
cgit v1.2.3


From 5d6b0cb3369df425de75c94c98eb3f1a86659022 Mon Sep 17 00:00:00 2001
From: Denis Drozdov <denisd@mellanox.com>
Date: Tue, 14 Aug 2018 14:22:35 +0300
Subject: RDMA/netdev: Fix netlink support in IPoIB

IPoIB netlink support was broken by the below commit since integrating
the rdma_netdev support relies on an allocation flow for netdevs that
was controlled by the ipoib driver while netdev's rtnl_newlink
implementation assumes that the netdev will be allocated by netlink.
Such situation leads to crash in __ipoib_device_add, once trying to
reuse netlink device.

This patch fixes the kernel oops for both mlx4 and mlx5
devices triggered by the following command:

Fixes: cd565b4b51e5 ("IB/IPoIB: Support acceleration options callbacks")
Signed-off-by: Denis Drozdov <denisd@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/core/verbs.c              |  28 ++++--
 drivers/infiniband/ulp/ipoib/ipoib.h         |   8 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c    | 123 +++++++++++++++------------
 drivers/infiniband/ulp/ipoib/ipoib_netlink.c |  23 ++++-
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c    |  19 +++--
 include/rdma/ib_verbs.h                      |   7 ++
 6 files changed, 136 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 2f34d3412097..8ec7418e99f0 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2643,13 +2643,27 @@ struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
 	if (!netdev)
 		return ERR_PTR(-ENOMEM);
 
-	rc = params.initialize_rdma_netdev(device, port_num, netdev,
-					   params.param);
-	if (rc) {
-		free_netdev(netdev);
-		return ERR_PTR(rc);
-	}
-
 	return netdev;
 }
 EXPORT_SYMBOL(rdma_alloc_netdev);
+
+int rdma_init_netdev(struct ib_device *device, u8 port_num,
+		     enum rdma_netdev_t type, const char *name,
+		     unsigned char name_assign_type,
+		     void (*setup)(struct net_device *),
+		     struct net_device *netdev)
+{
+	struct rdma_netdev_alloc_params params;
+	int rc;
+
+	if (!device->rdma_netdev_get_params)
+		return -EOPNOTSUPP;
+
+	rc = device->rdma_netdev_get_params(device, port_num, type, &params);
+	if (rc)
+		return rc;
+
+	return params.initialize_rdma_netdev(device, port_num,
+					     netdev, params.param);
+}
+EXPORT_SYMBOL(rdma_init_netdev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 1abe3c62f106..1da119d901a9 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -499,8 +499,10 @@ void ipoib_reap_ah(struct work_struct *work);
 struct ipoib_path *__path_find(struct net_device *dev, void *gid);
 void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
-struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
-					const char *format);
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+				    const char *format);
+int ipoib_intf_init(struct ib_device *hca, u8 port, const char *format,
+		    struct net_device *dev);
 void ipoib_ib_tx_timer_func(struct timer_list *t);
 void ipoib_ib_dev_flush_light(struct work_struct *work);
 void ipoib_ib_dev_flush_normal(struct work_struct *work);
@@ -531,6 +533,8 @@ int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
 void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
 			struct ipoib_tx_buf *tx_req);
 
+struct rtnl_link_ops *ipoib_get_link_ops(void);
+
 static inline void ipoib_build_sge(struct ipoib_dev_priv *priv,
 				   struct ipoib_tx_buf *tx_req)
 {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 9c816cd41724..8baa75a705c5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2115,77 +2115,58 @@ static const struct net_device_ops ipoib_netdev_default_pf = {
 	.ndo_stop		 = ipoib_ib_dev_stop_default,
 };
 
-static struct net_device
-*ipoib_create_netdev_default(struct ib_device *hca,
-			     const char *name,
-			     unsigned char name_assign_type,
-			     void (*setup)(struct net_device *))
-{
-	struct net_device *dev;
-	struct rdma_netdev *rn;
-
-	dev = alloc_netdev((int)sizeof(struct rdma_netdev),
-			   name,
-			   name_assign_type, setup);
-	if (!dev)
-		return NULL;
-
-	rn = netdev_priv(dev);
-
-	rn->send = ipoib_send;
-	rn->attach_mcast = ipoib_mcast_attach;
-	rn->detach_mcast = ipoib_mcast_detach;
-	rn->hca = hca;
-	dev->netdev_ops = &ipoib_netdev_default_pf;
-
-	return dev;
-}
-
-static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port,
-					   const char *name)
+static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
+					     const char *name)
 {
 	struct net_device *dev;
 
 	dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
 				NET_NAME_UNKNOWN, ipoib_setup_common);
-	if (!IS_ERR(dev))
+	if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP)
 		return dev;
-	if (PTR_ERR(dev) != -EOPNOTSUPP)
-		return NULL;
 
-	return ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
-					   ipoib_setup_common);
+	dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN,
+			   ipoib_setup_common);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+	return dev;
 }
 
-struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
-					const char *name)
+int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
+		    struct net_device *dev)
 {
-	struct net_device *dev;
+	struct rdma_netdev *rn = netdev_priv(dev);
 	struct ipoib_dev_priv *priv;
-	struct rdma_netdev *rn;
+	int rc;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
-		return NULL;
+		return -ENOMEM;
 
 	priv->ca = hca;
 	priv->port = port;
 
-	dev = ipoib_get_netdev(hca, port, name);
-	if (!dev)
-		goto free_priv;
+	rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
+			      NET_NAME_UNKNOWN, ipoib_setup_common, dev);
+	if (rc) {
+		if (rc != -EOPNOTSUPP)
+			goto out;
+
+		dev->netdev_ops = &ipoib_netdev_default_pf;
+		rn->send = ipoib_send;
+		rn->attach_mcast = ipoib_mcast_attach;
+		rn->detach_mcast = ipoib_mcast_detach;
+		rn->hca = hca;
+	}
 
 	priv->rn_ops = dev->netdev_ops;
 
-	/* fixme : should be after the query_cap */
-	if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION)
+	if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION)
 		dev->netdev_ops	= &ipoib_netdev_ops_vf;
 	else
 		dev->netdev_ops	= &ipoib_netdev_ops_pf;
 
-	rn = netdev_priv(dev);
 	rn->clnt_priv = priv;
-
 	/*
 	 * Only the child register_netdev flows can handle priv_destructor
 	 * being set, so we force it to NULL here and handle manually until it
@@ -2196,10 +2177,35 @@ struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
 
 	ipoib_build_priv(dev);
 
-	return priv;
-free_priv:
+	return 0;
+
+out:
 	kfree(priv);
-	return NULL;
+	return rc;
+}
+
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+				    const char *name)
+{
+	struct net_device *dev;
+	int rc;
+
+	dev = ipoib_alloc_netdev(hca, port, name);
+	if (IS_ERR(dev))
+		return dev;
+
+	rc = ipoib_intf_init(hca, port, name, dev);
+	if (rc) {
+		free_netdev(dev);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * Upon success the caller must ensure ipoib_intf_free is called or
+	 * register_netdevice succeed'd and priv_destructor is set to
+	 * ipoib_intf_free.
+	 */
+	return dev;
 }
 
 void ipoib_intf_free(struct net_device *dev)
@@ -2382,16 +2388,19 @@ int ipoib_add_pkey_attr(struct net_device *dev)
 static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
+	struct rtnl_link_ops *ops = ipoib_get_link_ops();
+	struct rdma_netdev_alloc_params params;
 	struct ipoib_dev_priv *priv;
 	struct net_device *ndev;
 	int result;
 
-	priv = ipoib_intf_alloc(hca, port, format);
-	if (!priv) {
-		pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port);
-		return ERR_PTR(-ENOMEM);
+	ndev = ipoib_intf_alloc(hca, port, format);
+	if (IS_ERR(ndev)) {
+		pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port,
+			PTR_ERR(ndev));
+		return ndev;
 	}
-	ndev = priv->dev;
+	priv = ipoib_priv(ndev);
 
 	INIT_IB_EVENT_HANDLER(&priv->event_handler,
 			      priv->ca, ipoib_event);
@@ -2412,6 +2421,14 @@ static struct net_device *ipoib_add_port(const char *format,
 		return ERR_PTR(result);
 	}
 
+	if (hca->rdma_netdev_get_params) {
+		int rc = hca->rdma_netdev_get_params(hca, port,
+						     RDMA_NETDEV_IPOIB,
+						     &params);
+
+		if (!rc && ops->priv_size < params.sizeof_priv)
+			ops->priv_size = params.sizeof_priv;
+	}
 	/*
 	 * We cannot set priv_destructor before register_netdev because we
 	 * need priv to be always valid during the error flow to execute
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
index d4d553a51fa9..38c984d16996 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -122,12 +122,26 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev,
 	} else
 		child_pkey  = nla_get_u16(data[IFLA_IPOIB_PKEY]);
 
+	err = ipoib_intf_init(ppriv->ca, ppriv->port, dev->name, dev);
+	if (err) {
+		ipoib_warn(ppriv, "failed to initialize pkey device\n");
+		return err;
+	}
+
 	err = __ipoib_vlan_add(ppriv, ipoib_priv(dev),
 			       child_pkey, IPOIB_RTNL_CHILD);
+	if (err)
+		return err;
 
-	if (!err && data)
+	if (data) {
 		err = ipoib_changelink(dev, tb, data, extack);
-	return err;
+		if (err) {
+			unregister_netdevice(dev);
+			return err;
+		}
+	}
+
+	return 0;
 }
 
 static size_t ipoib_get_size(const struct net_device *dev)
@@ -149,6 +163,11 @@ static struct rtnl_link_ops ipoib_link_ops __read_mostly = {
 	.fill_info	= ipoib_fill_info,
 };
 
+struct rtnl_link_ops *ipoib_get_link_ops(void)
+{
+	return &ipoib_link_ops;
+}
+
 int __init ipoib_netlink_init(void)
 {
 	return rtnl_link_register(&ipoib_link_ops);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 341753fbda54..8ac8e18fbe0c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -85,7 +85,7 @@ static bool is_child_unique(struct ipoib_dev_priv *ppriv,
 
 /*
  * NOTE: If this function fails then the priv->dev will remain valid, however
- * priv can have been freed and must not be touched by caller in the error
+ * priv will have been freed and must not be touched by caller in the error
  * case.
  *
  * If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to
@@ -100,6 +100,12 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
 
 	ASSERT_RTNL();
 
+	/*
+	 * We do not need to touch priv if register_netdevice fails, so just
+	 * always use this flow.
+	 */
+	ndev->priv_destructor = ipoib_intf_free;
+
 	/*
 	 * Racing with unregister of the parent must be prevented by the
 	 * caller.
@@ -120,9 +126,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
 		goto out_early;
 	}
 
-	/* We do not need to touch priv if register_netdevice fails */
-	ndev->priv_destructor = ipoib_intf_free;
-
 	result = register_netdevice(ndev);
 	if (result) {
 		ipoib_warn(priv, "failed to initialize; error %i", result);
@@ -182,12 +185,12 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 	snprintf(intf_name, sizeof(intf_name), "%s.%04x",
 		 ppriv->dev->name, pkey);
 
-	priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
-	if (!priv) {
-		result = -ENOMEM;
+	ndev = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
+	if (IS_ERR(ndev)) {
+		result = PTR_ERR(ndev);
 		goto out;
 	}
-	ndev = priv->dev;
+	priv = ipoib_priv(ndev);
 
 	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 020216cee8f1..0ed5d913a492 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4198,4 +4198,11 @@ struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
 				     enum rdma_netdev_t type, const char *name,
 				     unsigned char name_assign_type,
 				     void (*setup)(struct net_device *));
+
+int rdma_init_netdev(struct ib_device *device, u8 port_num,
+		     enum rdma_netdev_t type, const char *name,
+		     unsigned char name_assign_type,
+		     void (*setup)(struct net_device *),
+		     struct net_device *netdev);
+
 #endif /* IB_VERBS_H */
-- 
cgit v1.2.3


From ed792e28c4bd09e9a319d2ad914aa62982cb4c4a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 8 Oct 2018 14:06:34 -0700
Subject: net/ipv6: Make ipv6_route_table_template static

ipv6_route_table_template is exported but there are no users outside
of route.c. Make it static.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 2 --
 net/ipv6/route.c   | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index ff33f498c137..829650540780 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1089,8 +1089,6 @@ static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; }
 #endif
 
 #ifdef CONFIG_SYSCTL
-extern struct ctl_table ipv6_route_table_template[];
-
 struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
 struct ctl_table *ipv6_route_sysctl_init(struct net *net);
 int ipv6_sysctl_register(void);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7c38e0e058ae..bf4cd647d8b8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5031,7 +5031,7 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
-struct ctl_table ipv6_route_table_template[] = {
+static struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
 		.data		=	&init_net.ipv6.sysctl.flush_delay,
-- 
cgit v1.2.3


From f458e832ba510f843807fc2c2906a8fb59554c9f Mon Sep 17 00:00:00 2001
From: Chaitanya T K <chaitanya.mgit@gmail.com>
Date: Sat, 6 Oct 2018 19:34:59 +0200
Subject: mac80211: minstrel: Enable STBC and LDPC for VHT Rates

If peer support reception of STBC and LDPC, enable them for better
performance.

Signed-off-by: Chaitanya TK <chaitanya.mgit@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h          |  1 +
 net/mac80211/rc80211_minstrel_ht.c | 23 +++++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index c4809ad8ab46..0ef67f837ae1 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1670,6 +1670,7 @@ struct ieee80211_mu_edca_param_set {
 #define IEEE80211_VHT_CAP_RXSTBC_3				0x00000300
 #define IEEE80211_VHT_CAP_RXSTBC_4				0x00000400
 #define IEEE80211_VHT_CAP_RXSTBC_MASK				0x00000700
+#define IEEE80211_VHT_CAP_RXSTBC_SHIFT				8
 #define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE			0x00000800
 #define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE			0x00001000
 #define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT                  13
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 67ebdeaffbbc..16d1ac30978d 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1130,7 +1130,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	struct minstrel_ht_sta_priv *msp = priv_sta;
 	struct minstrel_ht_sta *mi = &msp->ht;
 	struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
-	u16 sta_cap = sta->ht_cap.cap;
+	u16 ht_cap = sta->ht_cap.cap;
 	struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
 	struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
 	int use_vht;
@@ -1138,6 +1138,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	int ack_dur;
 	int stbc;
 	int i;
+	bool ldpc;
 
 	/* fall back to the old minstrel for legacy stations */
 	if (!sta->ht_cap.ht_supported)
@@ -1175,16 +1176,22 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 	}
 	mi->sample_tries = 4;
 
-	/* TODO tx_flags for vht - ATM the RC API is not fine-grained enough */
 	if (!use_vht) {
-		stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >>
+		stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >>
 			IEEE80211_HT_CAP_RX_STBC_SHIFT;
-		mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
 
-		if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING)
-			mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+		ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING;
+	} else {
+		stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >>
+			IEEE80211_VHT_CAP_RXSTBC_SHIFT;
+
+		ldpc = vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC;
 	}
 
+	mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
+	if (ldpc)
+		mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+
 	for (i = 0; i < ARRAY_SIZE(mi->groups); i++) {
 		u32 gflags = minstrel_mcs_groups[i].flags;
 		int bw, nss;
@@ -1197,10 +1204,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
 
 		if (gflags & IEEE80211_TX_RC_SHORT_GI) {
 			if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) {
-				if (!(sta_cap & IEEE80211_HT_CAP_SGI_40))
+				if (!(ht_cap & IEEE80211_HT_CAP_SGI_40))
 					continue;
 			} else {
-				if (!(sta_cap & IEEE80211_HT_CAP_SGI_20))
+				if (!(ht_cap & IEEE80211_HT_CAP_SGI_20))
 					continue;
 			}
 		}
-- 
cgit v1.2.3


From bc847970f43281cb07c9f7d0897ee08cd1e08cf3 Mon Sep 17 00:00:00 2001
From: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Date: Wed, 3 Oct 2018 20:19:20 -0700
Subject: mac80211: support FTM responder configuration/statistics

New bss param ftm_responder is used to notify the driver to
enable fine timing request (FTM) responder role in AP mode.

Plumb the new cfg80211 API for FTM responder statistics through to
the driver API in mac80211.

Signed-off-by: David Spinadel <david.spinadel@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    | 28 ++++++++++++++++
 net/mac80211/cfg.c        | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/driver-ops.h | 16 +++++++++
 net/mac80211/trace.h      | 23 +++++++++++++
 net/mac80211/util.c       |  5 +++
 5 files changed, 157 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c4fadbafbf21..2ccd4d1bef89 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -309,6 +309,8 @@ struct ieee80211_vif_chanctx_switch {
  * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected
  *	keep alive) changed.
  * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface
+ * @BSS_CHANGED_FTM_RESPONDER: fime timing reasurement request responder
+ *	functionality changed for this BSS (AP mode).
  *
  */
 enum ieee80211_bss_change {
@@ -338,6 +340,7 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_MU_GROUPS		= 1<<23,
 	BSS_CHANGED_KEEP_ALIVE		= 1<<24,
 	BSS_CHANGED_MCAST_RATE		= 1<<25,
+	BSS_CHANGED_FTM_RESPONDER	= 1<<26,
 
 	/* when adding here, make sure to change ieee80211_reconfig */
 };
@@ -463,6 +466,21 @@ struct ieee80211_mu_group_data {
 	u8 position[WLAN_USER_POSITION_LEN];
 };
 
+/**
+ * ieee80211_ftm_responder_params - FTM responder parameters
+ *
+ * @lci: LCI subelement content
+ * @civicloc: CIVIC location subelement content
+ * @lci_len: LCI data length
+ * @civicloc_len: Civic data length
+ */
+struct ieee80211_ftm_responder_params {
+	const u8 *lci;
+	const u8 *civicloc;
+	size_t lci_len;
+	size_t civicloc_len;
+};
+
 /**
  * struct ieee80211_bss_conf - holds the BSS's changing parameters
  *
@@ -562,6 +580,9 @@ struct ieee80211_mu_group_data {
  * @protected_keep_alive: if set, indicates that the station should send an RSN
  *	protected frame to the AP to reset the idle timer at the AP for the
  *	station.
+ * @ftm_responder: whether to enable or disable fine timing measurement FTM
+ *	responder functionality.
+ * @ftmr_params: configurable lci/civic parameter when enabling FTM responder.
  */
 struct ieee80211_bss_conf {
 	const u8 *bssid;
@@ -612,6 +633,8 @@ struct ieee80211_bss_conf {
 	bool allow_p2p_go_ps;
 	u16 max_idle_period;
 	bool protected_keep_alive;
+	bool ftm_responder;
+	struct ieee80211_ftm_responder_params *ftmr_params;
 };
 
 /**
@@ -3598,6 +3621,8 @@ enum ieee80211_reconfig_type {
  *	aggregating two specific frames in the same A-MSDU. The relation
  *	between the skbs should be symmetric and transitive. Note that while
  *	skb is always a real frame, head may or may not be an A-MSDU.
+ * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
+ *	Statistics should be cumulative, currently no way to reset is provided.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -3883,6 +3908,9 @@ struct ieee80211_ops {
 	bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw,
 				       struct sk_buff *head,
 				       struct sk_buff *skb);
+	int (*get_ftm_responder_stats)(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       struct cfg80211_ftm_responder_stats *ftm_stats);
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 914aef7e7afd..51622333d460 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -790,6 +790,48 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
+static int ieee80211_set_ftm_responder_params(
+				struct ieee80211_sub_if_data *sdata,
+				const u8 *lci, size_t lci_len,
+				const u8 *civicloc, size_t civicloc_len)
+{
+	struct ieee80211_ftm_responder_params *new, *old;
+	struct ieee80211_bss_conf *bss_conf;
+	u8 *pos;
+	int len;
+
+	if ((!lci || !lci_len) && (!civicloc || !civicloc_len))
+		return 1;
+
+	bss_conf = &sdata->vif.bss_conf;
+	old = bss_conf->ftmr_params;
+	len = lci_len + civicloc_len;
+
+	new = kzalloc(sizeof(*new) + len, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	pos = (u8 *)(new + 1);
+	if (lci_len) {
+		new->lci_len = lci_len;
+		new->lci = pos;
+		memcpy(pos, lci, lci_len);
+		pos += lci_len;
+	}
+
+	if (civicloc_len) {
+		new->civicloc_len = civicloc_len;
+		new->civicloc = pos;
+		memcpy(pos, civicloc, civicloc_len);
+		pos += civicloc_len;
+	}
+
+	bss_conf->ftmr_params = new;
+	kfree(old);
+
+	return 0;
+}
+
 static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 				   struct cfg80211_beacon_data *params,
 				   const struct ieee80211_csa_settings *csa)
@@ -863,6 +905,20 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 	if (err == 0)
 		changed |= BSS_CHANGED_AP_PROBE_RESP;
 
+	if (params->ftm_responder != -1) {
+		sdata->vif.bss_conf.ftm_responder = params->ftm_responder;
+		err = ieee80211_set_ftm_responder_params(sdata,
+							 params->lci,
+							 params->lci_len,
+							 params->civicloc,
+							 params->civicloc_len);
+
+		if (err < 0)
+			return err;
+
+		changed |= BSS_CHANGED_FTM_RESPONDER;
+	}
+
 	rcu_assign_pointer(sdata->u.ap.beacon, new);
 
 	if (old)
@@ -1063,6 +1119,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
 		kfree_rcu(old_probe_resp, rcu_head);
 	sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF;
 
+	kfree(sdata->vif.bss_conf.ftmr_params);
+	sdata->vif.bss_conf.ftmr_params = NULL;
+
 	__sta_info_flush(sdata, true);
 	ieee80211_free_keys(sdata, true);
 
@@ -2875,6 +2934,20 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
 		memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
 		pos += beacon->probe_resp_len;
 	}
+	if (beacon->ftm_responder)
+		new_beacon->ftm_responder = beacon->ftm_responder;
+	if (beacon->lci) {
+		new_beacon->lci_len = beacon->lci_len;
+		new_beacon->lci = pos;
+		memcpy(pos, beacon->lci, beacon->lci_len);
+		pos += beacon->lci_len;
+	}
+	if (beacon->civicloc) {
+		new_beacon->civicloc_len = beacon->civicloc_len;
+		new_beacon->civicloc = pos;
+		memcpy(pos, beacon->civicloc, beacon->civicloc_len);
+		pos += beacon->civicloc_len;
+	}
 
 	return new_beacon;
 }
@@ -3765,6 +3838,17 @@ out:
 	return ret;
 }
 
+static int
+ieee80211_get_ftm_responder_stats(struct wiphy *wiphy,
+				  struct net_device *dev,
+				  struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	return drv_get_ftm_responder_stats(local, sdata, ftm_stats);
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -3859,4 +3943,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
 	.tx_control_port = ieee80211_tx_control_port,
 	.get_txq_stats = ieee80211_get_txq_stats,
+	.get_ftm_responder_stats = ieee80211_get_ftm_responder_stats,
 };
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index e42c641b6190..0b1747a2313d 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1183,6 +1183,22 @@ static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local,
 	return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb);
 }
 
+static inline int
+drv_get_ftm_responder_stats(struct ieee80211_local *local,
+			    struct ieee80211_sub_if_data *sdata,
+			    struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+	u32 ret = -EOPNOTSUPP;
+
+	if (local->ops->get_ftm_responder_stats)
+		ret = local->ops->get_ftm_responder_stats(&local->hw,
+							 &sdata->vif,
+							 ftm_stats);
+	trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats);
+
+	return ret;
+}
+
 static inline int drv_start_nan(struct ieee80211_local *local,
 				struct ieee80211_sub_if_data *sdata,
 				struct cfg80211_nan_conf *conf)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 0ab69a1964f8..588c51a67c89 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2600,6 +2600,29 @@ TRACE_EVENT(drv_wake_tx_queue,
 	)
 );
 
+TRACE_EVENT(drv_get_ftm_responder_stats,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct cfg80211_ftm_responder_stats *ftm_stats),
+
+	TP_ARGS(local, sdata, ftm_stats),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG
+	)
+);
+
 #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index ef5d1f60a63b..bec424316ea4 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2178,6 +2178,11 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		case NL80211_IFTYPE_AP:
 			changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS;
 
+			if (sdata->vif.bss_conf.ftm_responder == 1 &&
+			    wiphy_ext_feature_isset(sdata->local->hw.wiphy,
+					NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
+				changed |= BSS_CHANGED_FTM_RESPONDER;
+
 			if (sdata->vif.type == NL80211_IFTYPE_AP) {
 				changed |= BSS_CHANGED_AP_PROBE_RESP;
 
-- 
cgit v1.2.3


From 0d4e14a32dcab9c4bd559d02874120fbb86b1322 Mon Sep 17 00:00:00 2001
From: Ankita Bajaj <bankita@codeaurora.org>
Date: Thu, 27 Sep 2018 18:01:57 +0300
Subject: nl80211: Add per peer statistics to compute FCS error rate

Add support for drivers to report the total number of MPDUs received
and the number of MPDUs received with an FCS error from a specific
peer. These counters will be incremented only when the TA of the
frame matches the MAC address of the peer irrespective of FCS
error.

It should be noted that the TA field in the frame might be corrupted
when there is an FCS error and TA matching logic would fail in such
cases. Hence, FCS error counter might not be fully accurate, but it can
provide help in detecting bad RX links in significant number of cases.
This FCS error counter without full accuracy can be used, e.g., to
trigger a kick-out of a connected client with a bad link in AP mode to
force such a client to roam to another AP.

Signed-off-by: Ankita Bajaj <bankita@codeaurora.org>
Signed-off-by: Jouni Malinen <jouni@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 7 +++++++
 include/uapi/linux/nl80211.h | 8 ++++++++
 net/wireless/nl80211.c       | 2 ++
 3 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0e16e723dcef..1fa41b7a1be3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1303,6 +1303,10 @@ struct cfg80211_tid_stats {
  * @ack_signal: signal strength (in dBm) of the last ACK frame.
  * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
  *	been sent.
+ * @rx_mpdu_count: number of MPDUs received from this station
+ * @fcs_err_count: number of packets (MPDUs) received from this station with
+ *	an FCS error. This counter should be incremented only when TA of the
+ *	received packet with an FCS error matches the peer MAC address.
  */
 struct station_info {
 	u64 filled;
@@ -1349,6 +1353,9 @@ struct station_info {
 	struct cfg80211_tid_stats *pertid;
 	s8 ack_signal;
 	s8 avg_ack_signal;
+
+	u32 rx_mpdu_count;
+	u32 fcs_err_count;
 };
 
 #if IS_ENABLED(CONFIG_CFG80211)
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index dc6d5a1ef470..6d610bae30a9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3068,6 +3068,12 @@ enum nl80211_sta_bss_param {
  * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment
  * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm)
  * @NL80211_STA_INFO_ACK_SIGNAL_AVG: avg signal strength of ACK frames (s8, dBm)
+ * @NL80211_STA_INFO_RX_MPDUS: total number of received packets (MPDUs)
+ *	(u32, from this station)
+ * @NL80211_STA_INFO_FCS_ERROR_COUNT: total number of packets (MPDUs) received
+ *	with an FCS error (u32, from this station). This count may not include
+ *	some packets with an FCS error due to TA corruption. Hence this counter
+ *	might not be fully accurate.
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3108,6 +3114,8 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_PAD,
 	NL80211_STA_INFO_ACK_SIGNAL,
 	NL80211_STA_INFO_ACK_SIGNAL_AVG,
+	NL80211_STA_INFO_RX_MPDUS,
+	NL80211_STA_INFO_FCS_ERROR_COUNT,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 758bb069d000..744b5851bbf9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4761,6 +4761,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
 	PUT_SINFO_U64(BEACON_RX, rx_beacon);
 	PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
+	PUT_SINFO(RX_MPDUS, rx_mpdu_count, u32);
+	PUT_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32);
 	if (wiphy_ext_feature_isset(&rdev->wiphy,
 				    NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) {
 		PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
-- 
cgit v1.2.3


From f8252e7b5a83deee0e477fc1e31e3f06ceb35d28 Mon Sep 17 00:00:00 2001
From: Anilkumar Kolli <akolli@codeaurora.org>
Date: Thu, 11 Oct 2018 18:15:03 +0530
Subject: mac80211: implement ieee80211_tx_rate_update to update rate

Current mac80211 has provision to update tx status through
ieee80211_tx_status() and ieee80211_tx_status_ext(). But
drivers like ath10k updates the tx status from the skb except
txrate, txrate will be updated from a different path, peer stats.

Using ieee80211_tx_status_ext() in two different paths
(one for the stats, one for the tx rate) would duplicate
the stats instead.

To avoid this stats duplication, ieee80211_tx_rate_update()
is implemented.

Signed-off-by: Anilkumar Kolli <akolli@codeaurora.org>
[minor commit message editing, use initializers in code]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 15 +++++++++++++++
 net/mac80211/status.c  | 19 +++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 2ccd4d1bef89..71985e95d2d9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4379,6 +4379,21 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
 void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
 					   u32 thr);
 
+/**
+ * ieee80211_tx_rate_update - transmit rate update callback
+ *
+ * Drivers should call this functions with a non-NULL pub sta
+ * This function can be used in drivers that does not have provision
+ * in updating the tx rate in data path.
+ *
+ * @hw: the hardware the frame was transmitted by
+ * @pubsta: the station to update the tx rate for.
+ * @info: tx status information
+ */
+void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
+			      struct ieee80211_sta *pubsta,
+			      struct ieee80211_tx_info *info);
+
 /**
  * ieee80211_tx_status - transmit status callback
  *
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 91d7c0cd1882..aa4afbf0abaf 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -987,6 +987,25 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL(ieee80211_tx_status_ext);
 
+void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
+			      struct ieee80211_sta *pubsta,
+			      struct ieee80211_tx_info *info)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_supported_band *sband = hw->wiphy->bands[info->band];
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct ieee80211_tx_status status = {
+		.info = info,
+		.sta = pubsta,
+	};
+
+	rate_control_tx_status(local, sband, &status);
+
+	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
+		sta->tx_stats.last_rate = info->status.rates[0];
+}
+EXPORT_SYMBOL(ieee80211_tx_rate_update);
+
 void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-- 
cgit v1.2.3


From 7c6bb7d2faaf1ed7d78bafd712476e4cf2cf0d7d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 11 Oct 2018 20:17:21 -0700
Subject: net/ipv6: Add knob to skip DELROUTE message on device down

Another difference between IPv4 and IPv6 is the generation of RTM_DELROUTE
notifications when a device is taken down (admin down) or deleted. IPv4
does not generate a message for routes evicted by the down or delete;
IPv6 does. A NOS at scale really needs to avoid these messages and have
IPv4 and IPv6 behave similarly, relying on userspace to handle link
notifications and evict the routes.

At this point existing user behavior needs to be preserved. Since
notifications are a global action (not per app) the only way to preserve
existing behavior and allow the messages to be skipped is to add a new
sysctl (net/ipv6/route/skip_notify_on_dev_down) which can be set to
disable the notificatioons.

IPv6 route code already supports the option to skip the message (it is
used for multipath routes for example). Besides the new sysctl we need
to pass the skip_notify setting through the generic fib6_clean and
fib6_walk functions to fib6_clean_node and to set skip_notify on calls
to __ip_del_rt for the addrconf_ifdown path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  8 ++++++++
 include/net/ip6_fib.h                  |  3 +++
 include/net/netns/ipv6.h               |  1 +
 net/ipv6/ip6_fib.c                     | 20 +++++++++++++++-----
 net/ipv6/route.c                       | 20 +++++++++++++++++++-
 5 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 960de8fe3f40..163b5ff1073c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1442,6 +1442,14 @@ max_hbh_length - INTEGER
 	header.
 	Default: INT_MAX (unlimited)
 
+skip_notify_on_dev_down - BOOLEAN
+	Controls whether an RTM_DELROUTE message is generated for routes
+	removed when a device is taken down or deleted. IPv4 does not
+	generate this message; IPv6 does by default. Setting this sysctl
+	to true skips the message, making IPv4 and IPv6 on par in relying
+	on userspace caches to track link events and evict routes.
+	Default: false (generate message)
+
 IPv6 Fragmentation:
 
 ip6frag_high_thresh - INTEGER
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index f06e968f1992..caabfd84a098 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -407,6 +407,9 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 
 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
 		    void *arg);
+void fib6_clean_all_skip_notify(struct net *net,
+				int (*func)(struct fib6_info *, void *arg),
+				void *arg);
 
 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index f0e396ab9bec..ef1ed529f33c 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -45,6 +45,7 @@ struct netns_sysctl_ipv6 {
 	int max_dst_opts_len;
 	int max_hbh_opts_len;
 	int seg6_flowlabel;
+	bool skip_notify_on_dev_down;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e14d244c551f..9ba72d94d60f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -47,6 +47,7 @@ struct fib6_cleaner {
 	int (*func)(struct fib6_info *, void *arg);
 	int sernum;
 	void *arg;
+	bool skip_notify;
 };
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1956,6 +1957,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
 	struct nl_info info = {
 		.nl_net = c->net,
+		.skip_notify = c->skip_notify,
 	};
 
 	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
@@ -2007,7 +2009,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 			    int (*func)(struct fib6_info *, void *arg),
-			    int sernum, void *arg)
+			    int sernum, void *arg, bool skip_notify)
 {
 	struct fib6_cleaner c;
 
@@ -2019,13 +2021,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 	c.sernum = sernum;
 	c.arg = arg;
 	c.net = net;
+	c.skip_notify = skip_notify;
 
 	fib6_walk(net, &c.w);
 }
 
 static void __fib6_clean_all(struct net *net,
 			     int (*func)(struct fib6_info *, void *),
-			     int sernum, void *arg)
+			     int sernum, void *arg, bool skip_notify)
 {
 	struct fib6_table *table;
 	struct hlist_head *head;
@@ -2037,7 +2040,7 @@ static void __fib6_clean_all(struct net *net,
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
 			spin_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
-					func, sernum, arg);
+					func, sernum, arg, skip_notify);
 			spin_unlock_bh(&table->tb6_lock);
 		}
 	}
@@ -2047,14 +2050,21 @@ static void __fib6_clean_all(struct net *net,
 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
 		    void *arg)
 {
-	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
+}
+
+void fib6_clean_all_skip_notify(struct net *net,
+				int (*func)(struct fib6_info *, void *),
+				void *arg)
+{
+	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
 }
 
 static void fib6_flush_trees(struct net *net)
 {
 	int new_sernum = fib6_new_sernum(net);
 
-	__fib6_clean_all(net, NULL, new_sernum, NULL);
+	__fib6_clean_all(net, NULL, new_sernum, NULL, false);
 }
 
 /*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bf4cd647d8b8..f4e08b0689a8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4026,8 +4026,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
 			.event = event,
 		},
 	};
+	struct net *net = dev_net(dev);
 
-	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
+	if (net->ipv6.sysctl.skip_notify_on_dev_down)
+		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
+	else
+		fib6_clean_all(net, fib6_ifdown, &arg);
 }
 
 void rt6_disable_ip(struct net_device *dev, unsigned long event)
@@ -5031,6 +5035,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
+static int zero;
+static int one = 1;
+
 static struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
@@ -5102,6 +5109,15 @@ static struct ctl_table ipv6_route_table_template[] = {
 		.mode		=	0644,
 		.proc_handler	=	proc_dointvec_ms_jiffies,
 	},
+	{
+		.procname	=	"skip_notify_on_dev_down",
+		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec,
+		.extra1		=	&zero,
+		.extra2		=	&one,
+	},
 	{ }
 };
 
@@ -5125,6 +5141,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -5189,6 +5206,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
 
 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
 
-- 
cgit v1.2.3


From 859bd2ef1fc1110a8031b967ee656c53a6260a76 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 11 Oct 2018 20:33:49 -0700
Subject: net: Evict neighbor entries on carrier down

When a link's carrier goes down it could be a sign of the port changing
networks. If the new network has overlapping addresses with the old one,
then the kernel will continue trying to use neighbor entries established
based on the old network until the entries finally age out - meaning a
potentially long delay with communications not working.

This patch evicts neighbor entries on carrier down with the exception of
those marked permanent. Permanent entries are managed by userspace (either
an admin or a routing daemon such as FRR).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h |  1 +
 net/core/neighbour.c    | 27 +++++++++++++++++++++++----
 net/ipv4/arp.c          |  2 ++
 net/ipv6/ndisc.c        |  2 ++
 4 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 0874f7fcd859..f58b384aa6c9 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -323,6 +323,7 @@ void __neigh_set_probe_once(struct neighbour *neigh);
 bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl);
 void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
 int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev);
 int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb);
 int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb);
 int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index dc1389b8beb1..69c41cb3966d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -232,7 +232,8 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
 	}
 }
 
-static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
+			    bool skip_perm)
 {
 	int i;
 	struct neigh_hash_table *nht;
@@ -250,6 +251,10 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 				np = &n->next;
 				continue;
 			}
+			if (skip_perm && n->nud_state & NUD_PERMANENT) {
+				np = &n->next;
+				continue;
+			}
 			rcu_assign_pointer(*np,
 				   rcu_dereference_protected(n->next,
 						lockdep_is_held(&tbl->lock)));
@@ -285,21 +290,35 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
 {
 	write_lock_bh(&tbl->lock);
-	neigh_flush_dev(tbl, dev);
+	neigh_flush_dev(tbl, dev, false);
 	write_unlock_bh(&tbl->lock);
 }
 EXPORT_SYMBOL(neigh_changeaddr);
 
-int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+			  bool skip_perm)
 {
 	write_lock_bh(&tbl->lock);
-	neigh_flush_dev(tbl, dev);
+	neigh_flush_dev(tbl, dev, skip_perm);
 	pneigh_ifdown_and_unlock(tbl, dev);
 
 	del_timer_sync(&tbl->proxy_timer);
 	pneigh_queue_purge(&tbl->proxy_queue);
 	return 0;
 }
+
+int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev)
+{
+	__neigh_ifdown(tbl, dev, true);
+	return 0;
+}
+EXPORT_SYMBOL(neigh_carrier_down);
+
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+	__neigh_ifdown(tbl, dev, false);
+	return 0;
+}
 EXPORT_SYMBOL(neigh_ifdown);
 
 static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index e90c89ef8c08..850a6f13a082 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
 		change_info = ptr;
 		if (change_info->flags_changed & IFF_NOARP)
 			neigh_changeaddr(&arp_tbl, dev);
+		if (!netif_carrier_ok(dev))
+			neigh_carrier_down(&arp_tbl, dev);
 		break;
 	default:
 		break;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 51863ada15a4..a25cfdd47c89 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1784,6 +1784,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
 		change_info = ptr;
 		if (change_info->flags_changed & IFF_NOARP)
 			neigh_changeaddr(&nd_tbl, dev);
+		if (!netif_carrier_ok(dev))
+			neigh_carrier_down(&nd_tbl, dev);
 		break;
 	case NETDEV_DOWN:
 		neigh_ifdown(&nd_tbl, dev);
-- 
cgit v1.2.3


From 9163a0fc1f0c0980f117cc25f4fa6ba9b0750a36 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 12 Oct 2018 13:41:16 +0300
Subject: net: bridge: add support for per-port vlan stats

This patch adds an option to have per-port vlan stats instead of the
default global stats. The option can be set only when there are no port
vlans in the bridge since we need to allocate the stats if it is set
when vlans are being added to ports (and respectively free them
when being deleted). Also bump RTNL_MAX_TYPE as the bridge is the
largest user of options. The current stats design allows us to add
these without any changes to the fast-path, it all comes down to
the per-vlan stats pointer which, if this option is enabled, will
be allocated for each port vlan instead of using the global bridge-wide
one.

CC: bridge@lists.linux-foundation.org
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_netlink.c      | 14 ++++++++++++-
 net/bridge/br_private.h      |  2 ++
 net/bridge/br_sysfs_br.c     | 17 +++++++++++++++
 net/bridge/br_vlan.c         | 49 ++++++++++++++++++++++++++++++++++++++++++--
 net/core/rtnetlink.c         |  2 +-
 6 files changed, 81 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 58faab897201..1debfa42cba1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -287,6 +287,7 @@ enum {
 	IFLA_BR_MCAST_STATS_ENABLED,
 	IFLA_BR_MCAST_IGMP_VERSION,
 	IFLA_BR_MCAST_MLD_VERSION,
+	IFLA_BR_VLAN_STATS_PER_PORT,
 	__IFLA_BR_MAX,
 };
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index e5a5bc5d5232..3345f1984542 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1034,6 +1034,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
 	[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
 	[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
 	[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
+	[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
 };
 
 static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1114,6 +1115,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
 		if (err)
 			return err;
 	}
+
+	if (data[IFLA_BR_VLAN_STATS_PER_PORT]) {
+		__u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]);
+
+		err = br_vlan_set_stats_per_port(br, per_port);
+		if (err)
+			return err;
+	}
 #endif
 
 	if (data[IFLA_BR_GROUP_FWD_MASK]) {
@@ -1327,6 +1336,7 @@ static size_t br_get_size(const struct net_device *brdev)
 	       nla_total_size(sizeof(__be16)) +	/* IFLA_BR_VLAN_PROTOCOL */
 	       nla_total_size(sizeof(u16)) +    /* IFLA_BR_VLAN_DEFAULT_PVID */
 	       nla_total_size(sizeof(u8)) +     /* IFLA_BR_VLAN_STATS_ENABLED */
+	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_VLAN_STATS_PER_PORT */
 #endif
 	       nla_total_size(sizeof(u16)) +    /* IFLA_BR_GROUP_FWD_MASK */
 	       nla_total_size(sizeof(struct ifla_bridge_id)) +   /* IFLA_BR_ROOT_ID */
@@ -1417,7 +1427,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
 	    nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
 	    nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
-		       br_opt_get(br, BROPT_VLAN_STATS_ENABLED)))
+		       br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
+	    nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
+		       br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT)))
 		return -EMSGSIZE;
 #endif
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 57229b9d800f..10ee39fdca5c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -320,6 +320,7 @@ enum net_bridge_opts {
 	BROPT_HAS_IPV6_ADDR,
 	BROPT_NEIGH_SUPPRESS_ENABLED,
 	BROPT_MTU_SET_BY_USER,
+	BROPT_VLAN_STATS_PER_PORT,
 };
 
 struct net_bridge {
@@ -859,6 +860,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
 int __br_vlan_set_proto(struct net_bridge *br, __be16 proto);
 int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
 int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
 int br_vlan_init(struct net_bridge *br);
 int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
 int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index c93c5724609e..60182bef6341 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -803,6 +803,22 @@ static ssize_t vlan_stats_enabled_store(struct device *d,
 	return store_bridge_parm(d, buf, len, br_vlan_set_stats);
 }
 static DEVICE_ATTR_RW(vlan_stats_enabled);
+
+static ssize_t vlan_stats_per_port_show(struct device *d,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
+}
+
+static ssize_t vlan_stats_per_port_store(struct device *d,
+					 struct device_attribute *attr,
+					 const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port);
+}
+static DEVICE_ATTR_RW(vlan_stats_per_port);
 #endif
 
 static struct attribute *bridge_attrs[] = {
@@ -856,6 +872,7 @@ static struct attribute *bridge_attrs[] = {
 	&dev_attr_vlan_protocol.attr,
 	&dev_attr_default_pvid.attr,
 	&dev_attr_vlan_stats_enabled.attr,
+	&dev_attr_vlan_stats_per_port.attr,
 #endif
 	NULL
 };
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 5942e03dd845..9b707234e4ae 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -190,6 +190,19 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv)
 	}
 }
 
+static void nbp_vlan_rcu_free(struct rcu_head *rcu)
+{
+	struct net_bridge_vlan *v;
+
+	v = container_of(rcu, struct net_bridge_vlan, rcu);
+	WARN_ON(br_vlan_is_master(v));
+	/* if we had per-port stats configured then free them here */
+	if (v->brvlan->stats != v->stats)
+		free_percpu(v->stats);
+	v->stats = NULL;
+	kfree(v);
+}
+
 /* This is the shared VLAN add function which works for both ports and bridge
  * devices. There are four possible calls to this function in terms of the
  * vlan entry type:
@@ -245,7 +258,15 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
 		if (!masterv)
 			goto out_filt;
 		v->brvlan = masterv;
-		v->stats = masterv->stats;
+		if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
+			v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
+			if (!v->stats) {
+				err = -ENOMEM;
+				goto out_filt;
+			}
+		} else {
+			v->stats = masterv->stats;
+		}
 	} else {
 		err = br_switchdev_port_vlan_add(dev, v->vid, flags);
 		if (err && err != -EOPNOTSUPP)
@@ -329,7 +350,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
 		rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
 				       br_vlan_rht_params);
 		__vlan_del_list(v);
-		kfree_rcu(v, rcu);
+		call_rcu(&v->rcu, nbp_vlan_rcu_free);
 	}
 
 	br_vlan_put_master(masterv);
@@ -830,6 +851,30 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
 	return 0;
 }
 
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val)
+{
+	struct net_bridge_port *p;
+
+	/* allow to change the option if there are no port vlans configured */
+	list_for_each_entry(p, &br->port_list, list) {
+		struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
+
+		if (vg->num_vlans)
+			return -EBUSY;
+	}
+
+	switch (val) {
+	case 0:
+	case 1:
+		br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid)
 {
 	struct net_bridge_vlan *v;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 46328a10034a..0958c7be2c22 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,7 +59,7 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 
-#define RTNL_MAX_TYPE		48
+#define RTNL_MAX_TYPE		49
 #define RTNL_SLAVE_MAX_TYPE	36
 
 struct rtnl_link {
-- 
cgit v1.2.3


From 5886d932e52acfbe12ea5aac8e7c3ad6f16364d1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Oct 2018 12:53:00 +0200
Subject: netlink: replace __NLA_ENSURE implementation

We already have BUILD_BUG_ON_ZERO() which I just hadn't found
before, so we should use it here instead of open-coding another
implementation thereof.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index f1db8e594847..4c1e99303b5a 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -311,7 +311,7 @@ struct nla_policy {
 #define NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
 	{ .type = NLA_NESTED_ARRAY, .validation_data = policy, .len = maxattr }
 
-#define __NLA_ENSURE(condition) (sizeof(char[1 - 2*!(condition)]) - 1)
+#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
 #define NLA_ENSURE_INT_TYPE(tp)				\
 	(__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 ||	\
 		      tp == NLA_S16 || tp == NLA_U16 ||	\
-- 
cgit v1.2.3


From cc53aabcc283c36274d3f3ce9adc4b40c21d4838 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Thu, 11 Oct 2018 13:16:01 +0300
Subject: firmware: qcom: scm: Add WLAN VMID for Qualcomm SCM interface

Add WLAN related VMID's to support wlan driver to set up
the remote's permissions call via TrustZone.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Niklas Cassel <niklas.cassel@linaro.org>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/qcom_scm.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h
index 5d65521260b3..06996ad4f2bc 100644
--- a/include/linux/qcom_scm.h
+++ b/include/linux/qcom_scm.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2015, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2015, 2018, The Linux Foundation. All rights reserved.
  * Copyright (C) 2015 Linaro Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -33,6 +33,8 @@ struct qcom_scm_vmperm {
 
 #define QCOM_SCM_VMID_HLOS       0x3
 #define QCOM_SCM_VMID_MSS_MSA    0xF
+#define QCOM_SCM_VMID_WLAN       0x18
+#define QCOM_SCM_VMID_WLAN_CE    0x19
 #define QCOM_SCM_PERM_READ       0x4
 #define QCOM_SCM_PERM_WRITE      0x2
 #define QCOM_SCM_PERM_EXEC       0x1
-- 
cgit v1.2.3


From 571f739083e2544b343b5998608de679519de4e9 Mon Sep 17 00:00:00 2001
From: Mallikarjun Phulari <mallikarjun.phulari@intel.com>
Date: Fri, 5 Oct 2018 14:48:12 +0530
Subject: Bluetooth: Use separate L2CAP LE credit based connection result
 values

Add the result values specific to L2CAP LE credit based connections
and change the old result values wherever they were used.

Signed-off-by: Mallikarjun Phulari <mallikarjun.phulari@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/l2cap.h | 17 +++++++++++------
 net/bluetooth/l2cap_core.c    | 26 +++++++++++++-------------
 2 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 3555440e14fc..ea4b4ec85b78 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -277,12 +277,17 @@ struct l2cap_conn_rsp {
 #define L2CAP_CR_SEC_BLOCK	0x0003
 #define L2CAP_CR_NO_MEM		0x0004
 #define L2CAP_CR_BAD_AMP	0x0005
-#define L2CAP_CR_AUTHENTICATION	0x0005
-#define L2CAP_CR_AUTHORIZATION	0x0006
-#define L2CAP_CR_BAD_KEY_SIZE	0x0007
-#define L2CAP_CR_ENCRYPTION	0x0008
-#define L2CAP_CR_INVALID_SCID	0x0009
-#define L2CAP_CR_SCID_IN_USE	0x000A
+
+/* credit based connect results */
+#define L2CAP_CR_LE_SUCCESS		0x0000
+#define L2CAP_CR_LE_BAD_PSM		0x0002
+#define L2CAP_CR_LE_NO_MEM		0x0004
+#define L2CAP_CR_LE_AUTHENTICATION	0x0005
+#define L2CAP_CR_LE_AUTHORIZATION	0x0006
+#define L2CAP_CR_LE_BAD_KEY_SIZE	0x0007
+#define L2CAP_CR_LE_ENCRYPTION		0x0008
+#define L2CAP_CR_LE_INVALID_SCID	0x0009
+#define L2CAP_CR_LE_SCID_IN_USE		0X000A
 
 /* connect/create channel status */
 #define L2CAP_CS_NO_INFO	0x0000
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 514899f7f0d4..cf03a0122b2b 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -680,9 +680,9 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
 	u16 result;
 
 	if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
-		result = L2CAP_CR_AUTHORIZATION;
+		result = L2CAP_CR_LE_AUTHORIZATION;
 	else
-		result = L2CAP_CR_BAD_PSM;
+		result = L2CAP_CR_LE_BAD_PSM;
 
 	l2cap_state_change(chan, BT_DISCONN);
 
@@ -3670,7 +3670,7 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
 	rsp.mtu     = cpu_to_le16(chan->imtu);
 	rsp.mps     = cpu_to_le16(chan->mps);
 	rsp.credits = cpu_to_le16(chan->rx_credits);
-	rsp.result  = cpu_to_le16(L2CAP_CR_SUCCESS);
+	rsp.result  = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
 
 	l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
 		       &rsp);
@@ -5280,7 +5280,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
 	credits = __le16_to_cpu(rsp->credits);
 	result  = __le16_to_cpu(rsp->result);
 
-	if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 ||
+	if (result == L2CAP_CR_LE_SUCCESS && (mtu < 23 || mps < 23 ||
 					   dcid < L2CAP_CID_DYN_START ||
 					   dcid > L2CAP_CID_LE_DYN_END))
 		return -EPROTO;
@@ -5301,7 +5301,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
 	l2cap_chan_lock(chan);
 
 	switch (result) {
-	case L2CAP_CR_SUCCESS:
+	case L2CAP_CR_LE_SUCCESS:
 		if (__l2cap_get_chan_by_dcid(conn, dcid)) {
 			err = -EBADSLT;
 			break;
@@ -5315,8 +5315,8 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
 		l2cap_chan_ready(chan);
 		break;
 
-	case L2CAP_CR_AUTHENTICATION:
-	case L2CAP_CR_ENCRYPTION:
+	case L2CAP_CR_LE_AUTHENTICATION:
+	case L2CAP_CR_LE_ENCRYPTION:
 		/* If we already have MITM protection we can't do
 		 * anything.
 		 */
@@ -5459,7 +5459,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
 	pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
 					 &conn->hcon->dst, LE_LINK);
 	if (!pchan) {
-		result = L2CAP_CR_BAD_PSM;
+		result = L2CAP_CR_LE_BAD_PSM;
 		chan = NULL;
 		goto response;
 	}
@@ -5469,28 +5469,28 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
 
 	if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
 				     SMP_ALLOW_STK)) {
-		result = L2CAP_CR_AUTHENTICATION;
+		result = L2CAP_CR_LE_AUTHENTICATION;
 		chan = NULL;
 		goto response_unlock;
 	}
 
 	/* Check for valid dynamic CID range */
 	if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
-		result = L2CAP_CR_INVALID_SCID;
+		result = L2CAP_CR_LE_INVALID_SCID;
 		chan = NULL;
 		goto response_unlock;
 	}
 
 	/* Check if we already have channel with that dcid */
 	if (__l2cap_get_chan_by_dcid(conn, scid)) {
-		result = L2CAP_CR_SCID_IN_USE;
+		result = L2CAP_CR_LE_SCID_IN_USE;
 		chan = NULL;
 		goto response_unlock;
 	}
 
 	chan = pchan->ops->new_connection(pchan);
 	if (!chan) {
-		result = L2CAP_CR_NO_MEM;
+		result = L2CAP_CR_LE_NO_MEM;
 		goto response_unlock;
 	}
 
@@ -5526,7 +5526,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
 		chan->ops->defer(chan);
 	} else {
 		l2cap_chan_ready(chan);
-		result = L2CAP_CR_SUCCESS;
+		result = L2CAP_CR_LE_SUCCESS;
 	}
 
 response_unlock:
-- 
cgit v1.2.3


From dd1a8f8a88eecbc903f9ffff12332bec6d3f3be3 Mon Sep 17 00:00:00 2001
From: Mallikarjun Phulari <mallikarjun.phulari@intel.com>
Date: Fri, 5 Oct 2018 14:48:13 +0530
Subject: Bluetooth: Errata Service Release 8, Erratum 3253
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

L2CAP: New result values
	0x0006 - Connection refused – Invalid Source CID
	0x0007 - Connection refused – Source CID already allocated

As per the ESR08_V1.0.0, 1.11.2 Erratum 3253, Page No. 54,
"Remote CID invalid Issue".
Applies to Core Specification versions: V5.0, V4.2, v4.1, v4.0, and v3.0 + HS
Vol 3, Part A, Section 4.2, 4.3, 4.14, 4.15.

Core Specification Version 5.0, Page No.1753, Table 4.6 and
Page No. 1767, Table 4.14

New result values are added to l2cap connect/create channel response as
0x0006 - Connection refused – Invalid Source CID
0x0007 - Connection refused – Source CID already allocated

Signed-off-by: Mallikarjun Phulari <mallikarjun.phulari@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/l2cap.h |  2 ++
 net/bluetooth/l2cap_core.c    | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index ea4b4ec85b78..093aedebdf0c 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -277,6 +277,8 @@ struct l2cap_conn_rsp {
 #define L2CAP_CR_SEC_BLOCK	0x0003
 #define L2CAP_CR_NO_MEM		0x0004
 #define L2CAP_CR_BAD_AMP	0x0005
+#define L2CAP_CR_INVALID_SCID	0x0006
+#define L2CAP_CR_SCID_IN_USE	0x0007
 
 /* credit based connect results */
 #define L2CAP_CR_LE_SUCCESS		0x0000
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index cf03a0122b2b..2146e0f3b6f8 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -3816,9 +3816,17 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
 
 	result = L2CAP_CR_NO_MEM;
 
+	/* Check for valid dynamic CID range (as per Erratum 3253) */
+	if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_DYN_END) {
+		result = L2CAP_CR_INVALID_SCID;
+		goto response;
+	}
+
 	/* Check if we already have channel with that dcid */
-	if (__l2cap_get_chan_by_dcid(conn, scid))
+	if (__l2cap_get_chan_by_dcid(conn, scid)) {
+		result = L2CAP_CR_SCID_IN_USE;
 		goto response;
+	}
 
 	chan = pchan->ops->new_connection(pchan);
 	if (!chan)
-- 
cgit v1.2.3


From 1243a51f6c05ecbb2c5c9e02fdcc1e7a06f76f26 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 13 Oct 2018 02:45:57 +0200
Subject: tcp, ulp: remove ulp bits from sockmap

In order to prepare sockmap logic to be used in combination with kTLS
we need to detangle it from ULP, and further split it in later commits
into a generic API.

Joint work with John.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h    |  1 -
 kernel/bpf/sockmap.c | 39 ++++++++---------------------
 net/ipv4/tcp_ulp.c   | 71 ++++++++++------------------------------------------
 3 files changed, 23 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d2929223c70..8f5cef67fd35 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2057,7 +2057,6 @@ struct tcp_ulp_ops {
 int tcp_register_ulp(struct tcp_ulp_ops *type);
 void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
-int tcp_set_ulp_id(struct sock *sk, const int ulp);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5d0677d808ae..de6f7a65c72b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -182,6 +182,7 @@ enum {
 static struct proto *saved_tcpv6_prot __read_mostly;
 static DEFINE_SPINLOCK(tcpv6_prot_lock);
 static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
+
 static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
 			 struct proto *base)
 {
@@ -239,6 +240,13 @@ static int bpf_tcp_init(struct sock *sk)
 	return 0;
 }
 
+static int __init bpf_sock_init(void)
+{
+	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
+	return 0;
+}
+core_initcall(bpf_sock_init);
+
 static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
 static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
 
@@ -413,15 +421,6 @@ enum __sk_action {
 	__SK_NONE,
 };
 
-static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
-	.name		= "bpf_tcp",
-	.uid		= TCP_ULP_BPF,
-	.user_visible	= false,
-	.owner		= NULL,
-	.init		= bpf_tcp_init,
-	.release	= bpf_tcp_release,
-};
-
 static int memcopy_from_iter(struct sock *sk,
 			     struct sk_msg_buff *md,
 			     struct iov_iter *from, int bytes)
@@ -1236,16 +1235,6 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
 		bpf_prog_put(orig_tx_msg);
 }
 
-static int bpf_tcp_ulp_register(void)
-{
-	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
-	/* Once BPF TX ULP is registered it is never unregistered. It
-	 * will be in the ULP list for the lifetime of the system. Doing
-	 * duplicate registers is not a problem.
-	 */
-	return tcp_register_ulp(&bpf_tcp_ulp_ops);
-}
-
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -1491,7 +1480,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 {
 	if (refcount_dec_and_test(&psock->refcnt)) {
 		if (psock_is_smap_sk(sock))
-			tcp_cleanup_ulp(sock);
+			bpf_tcp_release(sock);
 		write_lock_bh(&sock->sk_callback_lock);
 		smap_stop_sock(psock, sock);
 		write_unlock_bh(&sock->sk_callback_lock);
@@ -1666,10 +1655,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
-	err = bpf_tcp_ulp_register();
-	if (err && err != -EEXIST)
-		return ERR_PTR(err);
-
 	stab = kzalloc(sizeof(*stab), GFP_USER);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -1951,7 +1936,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	if (tx_msg)
 		bpf_tcp_msg_add(psock, sock, tx_msg);
 	if (new) {
-		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+		err = bpf_tcp_init(sock);
 		if (err)
 			goto out_free;
 	}
@@ -2187,10 +2172,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 		 */
 		return ERR_PTR(-E2BIG);
 
-	err = bpf_tcp_ulp_register();
-	if (err && err != -EEXIST)
-		return ERR_PTR(err);
-
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 34e96353f115..a9162aa11af9 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -6,7 +6,7 @@
  *
  */
 
-#include<linux/module.h>
+#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/types.h>
 #include <linux/list.h>
@@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
 	return NULL;
 }
 
-static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
-{
-	struct tcp_ulp_ops *e;
-
-	list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
-		if (e->uid == ulp)
-			return e;
-	}
-
-	return NULL;
-}
-
 static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 {
 	const struct tcp_ulp_ops *ulp = NULL;
@@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 	return ulp;
 }
 
-static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
-{
-	const struct tcp_ulp_ops *ulp;
-
-	rcu_read_lock();
-	ulp = tcp_ulp_find_id(uid);
-	if (!ulp || !try_module_get(ulp->owner))
-		ulp = NULL;
-	rcu_read_unlock();
-	return ulp;
-}
-
 /* Attach new upper layer protocol to the list
  * of available protocols.
  */
@@ -135,56 +111,35 @@ void tcp_cleanup_ulp(struct sock *sk)
 	icsk->icsk_ulp_ops = NULL;
 }
 
-/* Change upper layer protocol for socket */
-int tcp_set_ulp(struct sock *sk, const char *name)
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_ulp_ops *ulp_ops;
-	int err = 0;
+	int err;
 
-	sock_owned_by_me(sk);
+	err = -EEXIST;
 	if (icsk->icsk_ulp_ops)
-		return -EEXIST;
-
-	ulp_ops = __tcp_ulp_find_autoload(name);
-	if (!ulp_ops)
-		return -ENOENT;
-
-	if (!ulp_ops->user_visible) {
-		module_put(ulp_ops->owner);
-		return -ENOENT;
-	}
+		goto out_err;
 
 	err = ulp_ops->init(sk);
-	if (err) {
-		module_put(ulp_ops->owner);
-		return err;
-	}
+	if (err)
+		goto out_err;
 
 	icsk->icsk_ulp_ops = ulp_ops;
 	return 0;
+out_err:
+	module_put(ulp_ops->owner);
+	return err;
 }
 
-int tcp_set_ulp_id(struct sock *sk, int ulp)
+int tcp_set_ulp(struct sock *sk, const char *name)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_ulp_ops *ulp_ops;
-	int err;
 
 	sock_owned_by_me(sk);
-	if (icsk->icsk_ulp_ops)
-		return -EEXIST;
 
-	ulp_ops = __tcp_ulp_lookup(ulp);
+	ulp_ops = __tcp_ulp_find_autoload(name);
 	if (!ulp_ops)
 		return -ENOENT;
 
-	err = ulp_ops->init(sk);
-	if (err) {
-		module_put(ulp_ops->owner);
-		return err;
-	}
-
-	icsk->icsk_ulp_ops = ulp_ops;
-	return 0;
+	return __tcp_set_ulp(sk, ulp_ops);
 }
-- 
cgit v1.2.3


From 604326b41a6fb9b4a78b6179335decee0365cd8c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 13 Oct 2018 02:45:58 +0200
Subject: bpf, sockmap: convert to generic sk_msg interface

Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.

This means that sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.

Latter becomes in particular useful for kTLS where data encryption
is deferred into the kernel, and as such enabling the kernel to
perform L7 introspection and policy based on BPF for TLS connections
where the record is being encrypted after BPF has run and came to
a verdict. In order to get there, first step is to transform open
coding of scatter-gather list handling into a common core framework
that subsystems can use.

The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same with the old sock map code and therefore no change
of user facing behavior or APIs. While pursuing this work it
also helped finding a number of bugs in the old sockmap code
that we've fixed already in earlier commits. The test_sockmap
kselftest suite passes through fine as well.

Joint work with John.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h       |   33 +-
 include/linux/bpf_types.h |    2 +-
 include/linux/filter.h    |   21 -
 include/linux/skmsg.h     |  371 +++++++
 include/net/tcp.h         |   27 +
 kernel/bpf/Makefile       |    5 -
 kernel/bpf/core.c         |    2 -
 kernel/bpf/sockmap.c      | 2610 ---------------------------------------------
 kernel/bpf/syscall.c      |    6 +-
 net/Kconfig               |   11 +
 net/core/Makefile         |    2 +
 net/core/filter.c         |  270 ++---
 net/core/skmsg.c          |  763 +++++++++++++
 net/core/sock_map.c       | 1002 +++++++++++++++++
 net/ipv4/Makefile         |    1 +
 net/ipv4/tcp_bpf.c        |  655 ++++++++++++
 net/strparser/Kconfig     |    4 +-
 17 files changed, 2925 insertions(+), 2860 deletions(-)
 create mode 100644 include/linux/skmsg.h
 delete mode 100644 kernel/bpf/sockmap.c
 create mode 100644 net/core/skmsg.c
 create mode 100644 net/core/sock_map.c
 create mode 100644 net/ipv4/tcp_bpf.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9b558713447f..e60fff48288b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 }
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
-struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
-struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-			struct bpf_prog *prog);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which);
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
-static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	return NULL;
-}
-
-static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
-						    void *key)
-{
-	return NULL;
-}
-
-static inline int sock_map_prog(struct bpf_map *map,
-				struct bpf_prog *prog,
-				u32 type)
+static inline int sock_map_prog_update(struct bpf_map *map,
+				       struct bpf_prog *prog, u32 which)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-				      struct bpf_prog *prog)
+static inline int sock_map_get_from_fd(const union bpf_attr *attr,
+				       struct bpf_prog *prog)
 {
 	return -EINVAL;
 }
@@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
 
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5432f4c9f50e..fa48343a5ea1 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
+#if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6791a0ac0139..5771874bc01e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -520,24 +520,6 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
-struct sk_msg_buff {
-	void *data;
-	void *data_end;
-	__u32 apply_bytes;
-	__u32 cork_bytes;
-	int sg_copybreak;
-	int sg_start;
-	int sg_curr;
-	int sg_end;
-	struct scatterlist sg_data[MAX_SKB_FRAGS];
-	bool sg_copy[MAX_SKB_FRAGS];
-	__u32 flags;
-	struct sock *sk_redir;
-	struct sock *sk;
-	struct sk_buff *skb;
-	struct list_head list;
-};
-
 struct bpf_redirect_info {
 	u32 ifindex;
 	u32 flags;
@@ -833,9 +815,6 @@ void xdp_do_flush_map(void);
 
 void bpf_warn_invalid_xdp_action(u32 act);
 
-struct sock *do_sk_redirect_map(struct sk_buff *skb);
-struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
-
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
new file mode 100644
index 000000000000..95678103c4a0
--- /dev/null
+++ b/include/linux/skmsg.h
@@ -0,0 +1,371 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#ifndef _LINUX_SKMSG_H
+#define _LINUX_SKMSG_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/strparser.h>
+
+#define MAX_MSG_FRAGS			MAX_SKB_FRAGS
+
+enum __sk_action {
+	__SK_DROP = 0,
+	__SK_PASS,
+	__SK_REDIRECT,
+	__SK_NONE,
+};
+
+struct sk_msg_sg {
+	u32				start;
+	u32				curr;
+	u32				end;
+	u32				size;
+	u32				copybreak;
+	bool				copy[MAX_MSG_FRAGS];
+	struct scatterlist		data[MAX_MSG_FRAGS];
+};
+
+struct sk_msg {
+	struct sk_msg_sg		sg;
+	void				*data;
+	void				*data_end;
+	u32				apply_bytes;
+	u32				cork_bytes;
+	u32				flags;
+	struct sk_buff			*skb;
+	struct sock			*sk_redir;
+	struct sock			*sk;
+	struct list_head		list;
+};
+
+struct sk_psock_progs {
+	struct bpf_prog			*msg_parser;
+	struct bpf_prog			*skb_parser;
+	struct bpf_prog			*skb_verdict;
+};
+
+enum sk_psock_state_bits {
+	SK_PSOCK_TX_ENABLED,
+};
+
+struct sk_psock_link {
+	struct list_head		list;
+	struct bpf_map			*map;
+	void				*link_raw;
+};
+
+struct sk_psock_parser {
+	struct strparser		strp;
+	bool				enabled;
+	void (*saved_data_ready)(struct sock *sk);
+};
+
+struct sk_psock_work_state {
+	struct sk_buff			*skb;
+	u32				len;
+	u32				off;
+};
+
+struct sk_psock {
+	struct sock			*sk;
+	struct sock			*sk_redir;
+	u32				apply_bytes;
+	u32				cork_bytes;
+	u32				eval;
+	struct sk_msg			*cork;
+	struct sk_psock_progs		progs;
+	struct sk_psock_parser		parser;
+	struct sk_buff_head		ingress_skb;
+	struct list_head		ingress_msg;
+	unsigned long			state;
+	struct list_head		link;
+	spinlock_t			link_lock;
+	refcount_t			refcnt;
+	void (*saved_unhash)(struct sock *sk);
+	void (*saved_close)(struct sock *sk, long timeout);
+	void (*saved_write_space)(struct sock *sk);
+	struct proto			*sk_proto;
+	struct sk_psock_work_state	work_state;
+	struct work_struct		work;
+	union {
+		struct rcu_head		rcu;
+		struct work_struct	gc;
+	};
+};
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+		 int elem_first_coalesce);
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
+int sk_msg_free(struct sock *sk, struct sk_msg *msg);
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes);
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+				  u32 bytes);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+			      struct sk_msg *msg, u32 bytes);
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+			     struct sk_msg *msg, u32 bytes);
+
+static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
+{
+	WARN_ON(i == msg->sg.end && bytes);
+}
+
+static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes)
+{
+	if (psock->apply_bytes) {
+		if (psock->apply_bytes < bytes)
+			psock->apply_bytes = 0;
+		else
+			psock->apply_bytes -= bytes;
+	}
+}
+
+#define sk_msg_iter_var_prev(var)			\
+	do {						\
+		if (var == 0)				\
+			var = MAX_MSG_FRAGS - 1;	\
+		else					\
+			var--;				\
+	} while (0)
+
+#define sk_msg_iter_var_next(var)			\
+	do {						\
+		var++;					\
+		if (var == MAX_MSG_FRAGS)		\
+			var = 0;			\
+	} while (0)
+
+#define sk_msg_iter_prev(msg, which)			\
+	sk_msg_iter_var_prev(msg->sg.which)
+
+#define sk_msg_iter_next(msg, which)			\
+	sk_msg_iter_var_next(msg->sg.which)
+
+static inline void sk_msg_clear_meta(struct sk_msg *msg)
+{
+	memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
+}
+
+static inline void sk_msg_init(struct sk_msg *msg)
+{
+	memset(msg, 0, sizeof(*msg));
+	sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data));
+}
+
+static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
+			       int which, u32 size)
+{
+	dst->sg.data[which] = src->sg.data[which];
+	dst->sg.data[which].length  = size;
+	src->sg.data[which].length -= size;
+	src->sg.data[which].offset += size;
+}
+
+static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
+{
+	return msg->sg.end >= msg->sg.start ?
+		msg->sg.end - msg->sg.start :
+		msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
+}
+
+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+	return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
+static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
+{
+	return &msg->sg.data[which];
+}
+
+static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
+{
+	return sg_page(sk_msg_elem(msg, which));
+}
+
+static inline bool sk_msg_to_ingress(const struct sk_msg *msg)
+{
+	return msg->flags & BPF_F_INGRESS;
+}
+
+static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
+
+	if (msg->sg.copy[msg->sg.start]) {
+		msg->data = NULL;
+		msg->data_end = NULL;
+	} else {
+		msg->data = sg_virt(sge);
+		msg->data_end = msg->data + sge->length;
+	}
+}
+
+static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
+				   u32 len, u32 offset)
+{
+	struct scatterlist *sge;
+
+	get_page(page);
+	sge = sk_msg_elem(msg, msg->sg.end);
+	sg_set_page(sge, page, len, offset);
+	sg_unmark_end(sge);
+
+	msg->sg.copy[msg->sg.end] = true;
+	msg->sg.size += len;
+	sk_msg_iter_next(msg, end);
+}
+
+static inline struct sk_psock *sk_psock(const struct sock *sk)
+{
+	return rcu_dereference_sk_user_data(sk);
+}
+
+static inline bool sk_has_psock(struct sock *sk)
+{
+	return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
+}
+
+static inline void sk_psock_queue_msg(struct sk_psock *psock,
+				      struct sk_msg *msg)
+{
+	list_add_tail(&msg->list, &psock->ingress_msg);
+}
+
+static inline void sk_psock_report_error(struct sk_psock *psock, int err)
+{
+	struct sock *sk = psock->sk;
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node);
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+			 struct sk_msg *msg);
+
+static inline struct sk_psock_link *sk_psock_init_link(void)
+{
+	return kzalloc(sizeof(struct sk_psock_link),
+		       GFP_ATOMIC | __GFP_NOWARN);
+}
+
+static inline void sk_psock_free_link(struct sk_psock_link *link)
+{
+	kfree(link);
+}
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link);
+#else
+static inline void sk_psock_unlink(struct sock *sk,
+				   struct sk_psock_link *link)
+{
+}
+#endif
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
+
+static inline void sk_psock_cork_free(struct sk_psock *psock)
+{
+	if (psock->cork) {
+		sk_msg_free(psock->sk, psock->cork);
+		kfree(psock->cork);
+		psock->cork = NULL;
+	}
+}
+
+static inline void sk_psock_update_proto(struct sock *sk,
+					 struct sk_psock *psock,
+					 struct proto *ops)
+{
+	psock->saved_unhash = sk->sk_prot->unhash;
+	psock->saved_close = sk->sk_prot->close;
+	psock->saved_write_space = sk->sk_write_space;
+
+	psock->sk_proto = sk->sk_prot;
+	sk->sk_prot = ops;
+}
+
+static inline void sk_psock_restore_proto(struct sock *sk,
+					  struct sk_psock *psock)
+{
+	if (psock->sk_proto) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
+}
+
+static inline void sk_psock_set_state(struct sk_psock *psock,
+				      enum sk_psock_state_bits bit)
+{
+	set_bit(bit, &psock->state);
+}
+
+static inline void sk_psock_clear_state(struct sk_psock *psock,
+					enum sk_psock_state_bits bit)
+{
+	clear_bit(bit, &psock->state);
+}
+
+static inline bool sk_psock_test_state(const struct sk_psock *psock,
+				       enum sk_psock_state_bits bit)
+{
+	return test_bit(bit, &psock->state);
+}
+
+static inline struct sk_psock *sk_psock_get(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock && !refcount_inc_not_zero(&psock->refcnt))
+		psock = NULL;
+	rcu_read_unlock();
+	return psock;
+}
+
+void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
+void sk_psock_destroy(struct rcu_head *rcu);
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
+
+static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
+{
+	if (refcount_dec_and_test(&psock->refcnt))
+		sk_psock_drop(sk, psock);
+}
+
+static inline void psock_set_prog(struct bpf_prog **pprog,
+				  struct bpf_prog *prog)
+{
+	prog = xchg(pprog, prog);
+	if (prog)
+		bpf_prog_put(prog);
+}
+
+static inline void psock_progs_drop(struct sk_psock_progs *progs)
+{
+	psock_set_prog(&progs->msg_parser, NULL);
+	psock_set_prog(&progs->skb_parser, NULL);
+	psock_set_prog(&progs->skb_verdict, NULL);
+}
+
+#endif /* _LINUX_SKMSG_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8f5cef67fd35..3600ae0f25c3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
 }
 
+static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
+}
+
+static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->bpf.sk_redir;
+}
+
+static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 /* This is the variant of inet6_iif() that must be used by TCP,
  * as TCP moves IP6CB into a different location in skb->cb[]
@@ -2064,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk);
 	__MODULE_INFO(alias, alias_userspace, name);		\
 	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
 
+struct sk_msg;
+struct sk_psock;
+
+int tcp_bpf_init(struct sock *sk);
+void tcp_bpf_reinit(struct sock *sk);
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
+			  int flags);
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		    int nonblock, int flags, int *addr_len);
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+		      struct msghdr *msg, int len);
+
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
  * program does not support the chosen operation or there is no BPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0488b8258321..ff8262626b8f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)
 obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
 endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
-ifeq ($(CONFIG_STREAM_PARSER),y)
-ifeq ($(CONFIG_INET),y)
-obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
-endif
-endif
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1af0826..defcf4df6d91 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
-const struct bpf_func_proto bpf_sock_map_update_proto __weak;
-const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 const struct bpf_func_proto bpf_get_local_storage_proto __weak;
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
deleted file mode 100644
index de6f7a65c72b..000000000000
--- a/kernel/bpf/sockmap.c
+++ /dev/null
@@ -1,2610 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-/* A BPF sock_map is used to store sock objects. This is primarly used
- * for doing socket redirect with BPF helper routines.
- *
- * A sock map may have BPF programs attached to it, currently a program
- * used to parse packets and a program to provide a verdict and redirect
- * decision on the packet are supported. Any programs attached to a sock
- * map are inherited by sock objects when they are added to the map. If
- * no BPF programs are attached the sock object may only be used for sock
- * redirect.
- *
- * A sock object may be in multiple maps, but can only inherit a single
- * parse or verdict program. If adding a sock object to a map would result
- * in having multiple parsing programs the update will return an EBUSY error.
- *
- * For reference this program is similar to devmap used in XDP context
- * reviewing these together may be useful. For an example please review
- * ./samples/bpf/sockmap/.
- */
-#include <linux/bpf.h>
-#include <net/sock.h>
-#include <linux/filter.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/kernel.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/workqueue.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <net/strparser.h>
-#include <net/tcp.h>
-#include <linux/ptr_ring.h>
-#include <net/inet_common.h>
-#include <linux/sched/signal.h>
-
-#define SOCK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-
-struct bpf_sock_progs {
-	struct bpf_prog *bpf_tx_msg;
-	struct bpf_prog *bpf_parse;
-	struct bpf_prog *bpf_verdict;
-};
-
-struct bpf_stab {
-	struct bpf_map map;
-	struct sock **sock_map;
-	struct bpf_sock_progs progs;
-	raw_spinlock_t lock;
-};
-
-struct bucket {
-	struct hlist_head head;
-	raw_spinlock_t lock;
-};
-
-struct bpf_htab {
-	struct bpf_map map;
-	struct bucket *buckets;
-	atomic_t count;
-	u32 n_buckets;
-	u32 elem_size;
-	struct bpf_sock_progs progs;
-	struct rcu_head rcu;
-};
-
-struct htab_elem {
-	struct rcu_head rcu;
-	struct hlist_node hash_node;
-	u32 hash;
-	struct sock *sk;
-	char key[0];
-};
-
-enum smap_psock_state {
-	SMAP_TX_RUNNING,
-};
-
-struct smap_psock_map_entry {
-	struct list_head list;
-	struct bpf_map *map;
-	struct sock **entry;
-	struct htab_elem __rcu *hash_link;
-};
-
-struct smap_psock {
-	struct rcu_head	rcu;
-	refcount_t refcnt;
-
-	/* datapath variables */
-	struct sk_buff_head rxqueue;
-	bool strp_enabled;
-
-	/* datapath error path cache across tx work invocations */
-	int save_rem;
-	int save_off;
-	struct sk_buff *save_skb;
-
-	/* datapath variables for tx_msg ULP */
-	struct sock *sk_redir;
-	int apply_bytes;
-	int cork_bytes;
-	int sg_size;
-	int eval;
-	struct sk_msg_buff *cork;
-	struct list_head ingress;
-
-	struct strparser strp;
-	struct bpf_prog *bpf_tx_msg;
-	struct bpf_prog *bpf_parse;
-	struct bpf_prog *bpf_verdict;
-	struct list_head maps;
-	spinlock_t maps_lock;
-
-	/* Back reference used when sock callback trigger sockmap operations */
-	struct sock *sock;
-	unsigned long state;
-
-	struct work_struct tx_work;
-	struct work_struct gc_work;
-
-	struct proto *sk_proto;
-	void (*save_unhash)(struct sock *sk);
-	void (*save_close)(struct sock *sk, long timeout);
-	void (*save_data_ready)(struct sock *sk);
-	void (*save_write_space)(struct sock *sk);
-};
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-			   int nonblock, int flags, int *addr_len);
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
-			    int offset, size_t size, int flags);
-static void bpf_tcp_unhash(struct sock *sk);
-static void bpf_tcp_close(struct sock *sk, long timeout);
-
-static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
-{
-	return rcu_dereference_sk_user_data(sk);
-}
-
-static bool bpf_tcp_stream_read(const struct sock *sk)
-{
-	struct smap_psock *psock;
-	bool empty = true;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out;
-	empty = list_empty(&psock->ingress);
-out:
-	rcu_read_unlock();
-	return !empty;
-}
-
-enum {
-	SOCKMAP_IPV4,
-	SOCKMAP_IPV6,
-	SOCKMAP_NUM_PROTS,
-};
-
-enum {
-	SOCKMAP_BASE,
-	SOCKMAP_TX,
-	SOCKMAP_NUM_CONFIGS,
-};
-
-static struct proto *saved_tcpv6_prot __read_mostly;
-static DEFINE_SPINLOCK(tcpv6_prot_lock);
-static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
-
-static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
-			 struct proto *base)
-{
-	prot[SOCKMAP_BASE]			= *base;
-	prot[SOCKMAP_BASE].unhash		= bpf_tcp_unhash;
-	prot[SOCKMAP_BASE].close		= bpf_tcp_close;
-	prot[SOCKMAP_BASE].recvmsg		= bpf_tcp_recvmsg;
-	prot[SOCKMAP_BASE].stream_memory_read	= bpf_tcp_stream_read;
-
-	prot[SOCKMAP_TX]			= prot[SOCKMAP_BASE];
-	prot[SOCKMAP_TX].sendmsg		= bpf_tcp_sendmsg;
-	prot[SOCKMAP_TX].sendpage		= bpf_tcp_sendpage;
-}
-
-static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
-{
-	int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
-	int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
-
-	sk->sk_prot = &bpf_tcp_prots[family][conf];
-}
-
-static int bpf_tcp_init(struct sock *sk)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		return -EINVAL;
-	}
-
-	if (unlikely(psock->sk_proto)) {
-		rcu_read_unlock();
-		return -EBUSY;
-	}
-
-	psock->save_unhash = sk->sk_prot->unhash;
-	psock->save_close = sk->sk_prot->close;
-	psock->sk_proto = sk->sk_prot;
-
-	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
-	if (sk->sk_family == AF_INET6 &&
-	    unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
-		spin_lock_bh(&tcpv6_prot_lock);
-		if (likely(sk->sk_prot != saved_tcpv6_prot)) {
-			build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
-			smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
-		}
-		spin_unlock_bh(&tcpv6_prot_lock);
-	}
-	update_sk_prot(sk, psock);
-	rcu_read_unlock();
-	return 0;
-}
-
-static int __init bpf_sock_init(void)
-{
-	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
-	return 0;
-}
-core_initcall(bpf_sock_init);
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
-
-static void bpf_tcp_release(struct sock *sk)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out;
-
-	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork, true);
-		kfree(psock->cork);
-		psock->cork = NULL;
-	}
-
-	if (psock->sk_proto) {
-		sk->sk_prot = psock->sk_proto;
-		psock->sk_proto = NULL;
-	}
-out:
-	rcu_read_unlock();
-}
-
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-					 u32 hash, void *key, u32 key_size)
-{
-	struct htab_elem *l;
-
-	hlist_for_each_entry_rcu(l, head, hash_node) {
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
-			return l;
-	}
-
-	return NULL;
-}
-
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &__select_bucket(htab, hash)->head;
-}
-
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
-{
-	atomic_dec(&htab->count);
-	kfree_rcu(l, rcu);
-}
-
-static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
-						  struct smap_psock *psock)
-{
-	struct smap_psock_map_entry *e;
-
-	spin_lock_bh(&psock->maps_lock);
-	e = list_first_entry_or_null(&psock->maps,
-				     struct smap_psock_map_entry,
-				     list);
-	if (e)
-		list_del(&e->list);
-	spin_unlock_bh(&psock->maps_lock);
-	return e;
-}
-
-static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
-{
-	struct smap_psock_map_entry *e;
-	struct sk_msg_buff *md, *mtmp;
-	struct sock *osk;
-
-	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork, true);
-		kfree(psock->cork);
-		psock->cork = NULL;
-	}
-
-	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
-		list_del(&md->list);
-		free_start_sg(psock->sock, md, true);
-		kfree(md);
-	}
-
-	e = psock_map_pop(sk, psock);
-	while (e) {
-		if (e->entry) {
-			struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
-
-			raw_spin_lock_bh(&stab->lock);
-			osk = *e->entry;
-			if (osk == sk) {
-				*e->entry = NULL;
-				smap_release_sock(psock, sk);
-			}
-			raw_spin_unlock_bh(&stab->lock);
-		} else {
-			struct htab_elem *link = rcu_dereference(e->hash_link);
-			struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
-			struct hlist_head *head;
-			struct htab_elem *l;
-			struct bucket *b;
-
-			b = __select_bucket(htab, link->hash);
-			head = &b->head;
-			raw_spin_lock_bh(&b->lock);
-			l = lookup_elem_raw(head,
-					    link->hash, link->key,
-					    htab->map.key_size);
-			/* If another thread deleted this object skip deletion.
-			 * The refcnt on psock may or may not be zero.
-			 */
-			if (l && l == link) {
-				hlist_del_rcu(&link->hash_node);
-				smap_release_sock(psock, link->sk);
-				free_htab_elem(htab, link);
-			}
-			raw_spin_unlock_bh(&b->lock);
-		}
-		kfree(e);
-		e = psock_map_pop(sk, psock);
-	}
-}
-
-static void bpf_tcp_unhash(struct sock *sk)
-{
-	void (*unhash_fun)(struct sock *sk);
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		if (sk->sk_prot->unhash)
-			sk->sk_prot->unhash(sk);
-		return;
-	}
-	unhash_fun = psock->save_unhash;
-	bpf_tcp_remove(sk, psock);
-	rcu_read_unlock();
-	unhash_fun(sk);
-}
-
-static void bpf_tcp_close(struct sock *sk, long timeout)
-{
-	void (*close_fun)(struct sock *sk, long timeout);
-	struct smap_psock *psock;
-
-	lock_sock(sk);
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		release_sock(sk);
-		return sk->sk_prot->close(sk, timeout);
-	}
-	close_fun = psock->save_close;
-	bpf_tcp_remove(sk, psock);
-	rcu_read_unlock();
-	release_sock(sk);
-	close_fun(sk, timeout);
-}
-
-enum __sk_action {
-	__SK_DROP = 0,
-	__SK_PASS,
-	__SK_REDIRECT,
-	__SK_NONE,
-};
-
-static int memcopy_from_iter(struct sock *sk,
-			     struct sk_msg_buff *md,
-			     struct iov_iter *from, int bytes)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = md->sg_curr, rc = -ENOSPC;
-
-	do {
-		int copy;
-		char *to;
-
-		if (md->sg_copybreak >= sg[i].length) {
-			md->sg_copybreak = 0;
-
-			if (++i == MAX_SKB_FRAGS)
-				i = 0;
-
-			if (i == md->sg_end)
-				break;
-		}
-
-		copy = sg[i].length - md->sg_copybreak;
-		to = sg_virt(&sg[i]) + md->sg_copybreak;
-		md->sg_copybreak += copy;
-
-		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
-			rc = copy_from_iter_nocache(to, copy, from);
-		else
-			rc = copy_from_iter(to, copy, from);
-
-		if (rc != copy) {
-			rc = -EFAULT;
-			goto out;
-		}
-
-		bytes -= copy;
-		if (!bytes)
-			break;
-
-		md->sg_copybreak = 0;
-		if (++i == MAX_SKB_FRAGS)
-			i = 0;
-	} while (i != md->sg_end);
-out:
-	md->sg_curr = i;
-	return rc;
-}
-
-static int bpf_tcp_push(struct sock *sk, int apply_bytes,
-			struct sk_msg_buff *md,
-			int flags, bool uncharge)
-{
-	bool apply = apply_bytes;
-	struct scatterlist *sg;
-	int offset, ret = 0;
-	struct page *p;
-	size_t size;
-
-	while (1) {
-		sg = md->sg_data + md->sg_start;
-		size = (apply && apply_bytes < sg->length) ?
-			apply_bytes : sg->length;
-		offset = sg->offset;
-
-		tcp_rate_check_app_limited(sk);
-		p = sg_page(sg);
-retry:
-		ret = do_tcp_sendpages(sk, p, offset, size, flags);
-		if (ret != size) {
-			if (ret > 0) {
-				if (apply)
-					apply_bytes -= ret;
-
-				sg->offset += ret;
-				sg->length -= ret;
-				size -= ret;
-				offset += ret;
-				if (uncharge)
-					sk_mem_uncharge(sk, ret);
-				goto retry;
-			}
-
-			return ret;
-		}
-
-		if (apply)
-			apply_bytes -= ret;
-		sg->offset += ret;
-		sg->length -= ret;
-		if (uncharge)
-			sk_mem_uncharge(sk, ret);
-
-		if (!sg->length) {
-			put_page(p);
-			md->sg_start++;
-			if (md->sg_start == MAX_SKB_FRAGS)
-				md->sg_start = 0;
-			sg_init_table(sg, 1);
-
-			if (md->sg_start == md->sg_end)
-				break;
-		}
-
-		if (apply && !apply_bytes)
-			break;
-	}
-	return 0;
-}
-
-static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
-{
-	struct scatterlist *sg = md->sg_data + md->sg_start;
-
-	if (md->sg_copy[md->sg_start]) {
-		md->data = md->data_end = 0;
-	} else {
-		md->data = sg_virt(sg);
-		md->data_end = md->data + sg->length;
-	}
-}
-
-static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = md->sg_start;
-
-	do {
-		int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
-
-		sk_mem_uncharge(sk, uncharge);
-		bytes -= uncharge;
-		if (!bytes)
-			break;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	} while (i != md->sg_end);
-}
-
-static void free_bytes_sg(struct sock *sk, int bytes,
-			  struct sk_msg_buff *md, bool charge)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = md->sg_start, free;
-
-	while (bytes && sg[i].length) {
-		free = sg[i].length;
-		if (bytes < free) {
-			sg[i].length -= bytes;
-			sg[i].offset += bytes;
-			if (charge)
-				sk_mem_uncharge(sk, bytes);
-			break;
-		}
-
-		if (charge)
-			sk_mem_uncharge(sk, sg[i].length);
-		put_page(sg_page(&sg[i]));
-		bytes -= sg[i].length;
-		sg[i].length = 0;
-		sg[i].page_link = 0;
-		sg[i].offset = 0;
-		i++;
-
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	md->sg_start = i;
-}
-
-static int free_sg(struct sock *sk, int start,
-		   struct sk_msg_buff *md, bool charge)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = start, free = 0;
-
-	while (sg[i].length) {
-		free += sg[i].length;
-		if (charge)
-			sk_mem_uncharge(sk, sg[i].length);
-		if (!md->skb)
-			put_page(sg_page(&sg[i]));
-		sg[i].length = 0;
-		sg[i].page_link = 0;
-		sg[i].offset = 0;
-		i++;
-
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	consume_skb(md->skb);
-
-	return free;
-}
-
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
-{
-	int free = free_sg(sk, md->sg_start, md, charge);
-
-	md->sg_start = md->sg_end;
-	return free;
-}
-
-static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
-{
-	return free_sg(sk, md->sg_curr, md, true);
-}
-
-static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
-{
-	return ((_rc == SK_PASS) ?
-	       (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
-	       __SK_DROP);
-}
-
-static unsigned int smap_do_tx_msg(struct sock *sk,
-				   struct smap_psock *psock,
-				   struct sk_msg_buff *md)
-{
-	struct bpf_prog *prog;
-	unsigned int rc, _rc;
-
-	preempt_disable();
-	rcu_read_lock();
-
-	/* If the policy was removed mid-send then default to 'accept' */
-	prog = READ_ONCE(psock->bpf_tx_msg);
-	if (unlikely(!prog)) {
-		_rc = SK_PASS;
-		goto verdict;
-	}
-
-	bpf_compute_data_pointers_sg(md);
-	md->sk = sk;
-	rc = (*prog->bpf_func)(md, prog->insnsi);
-	psock->apply_bytes = md->apply_bytes;
-
-	/* Moving return codes from UAPI namespace into internal namespace */
-	_rc = bpf_map_msg_verdict(rc, md);
-
-	/* The psock has a refcount on the sock but not on the map and because
-	 * we need to drop rcu read lock here its possible the map could be
-	 * removed between here and when we need it to execute the sock
-	 * redirect. So do the map lookup now for future use.
-	 */
-	if (_rc == __SK_REDIRECT) {
-		if (psock->sk_redir)
-			sock_put(psock->sk_redir);
-		psock->sk_redir = do_msg_redirect_map(md);
-		if (!psock->sk_redir) {
-			_rc = __SK_DROP;
-			goto verdict;
-		}
-		sock_hold(psock->sk_redir);
-	}
-verdict:
-	rcu_read_unlock();
-	preempt_enable();
-
-	return _rc;
-}
-
-static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
-			   struct smap_psock *psock,
-			   struct sk_msg_buff *md, int flags)
-{
-	bool apply = apply_bytes;
-	size_t size, copied = 0;
-	struct sk_msg_buff *r;
-	int err = 0, i;
-
-	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
-	if (unlikely(!r))
-		return -ENOMEM;
-
-	lock_sock(sk);
-	r->sg_start = md->sg_start;
-	i = md->sg_start;
-
-	do {
-		size = (apply && apply_bytes < md->sg_data[i].length) ?
-			apply_bytes : md->sg_data[i].length;
-
-		if (!sk_wmem_schedule(sk, size)) {
-			if (!copied)
-				err = -ENOMEM;
-			break;
-		}
-
-		sk_mem_charge(sk, size);
-		r->sg_data[i] = md->sg_data[i];
-		r->sg_data[i].length = size;
-		md->sg_data[i].length -= size;
-		md->sg_data[i].offset += size;
-		copied += size;
-
-		if (md->sg_data[i].length) {
-			get_page(sg_page(&r->sg_data[i]));
-			r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
-		} else {
-			i++;
-			if (i == MAX_SKB_FRAGS)
-				i = 0;
-			r->sg_end = i;
-		}
-
-		if (apply) {
-			apply_bytes -= size;
-			if (!apply_bytes)
-				break;
-		}
-	} while (i != md->sg_end);
-
-	md->sg_start = i;
-
-	if (!err) {
-		list_add_tail(&r->list, &psock->ingress);
-		sk->sk_data_ready(sk);
-	} else {
-		free_start_sg(sk, r, true);
-		kfree(r);
-	}
-
-	release_sock(sk);
-	return err;
-}
-
-static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
-				       struct sk_msg_buff *md,
-				       int flags)
-{
-	bool ingress = !!(md->flags & BPF_F_INGRESS);
-	struct smap_psock *psock;
-	int err = 0;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out_rcu;
-
-	if (!refcount_inc_not_zero(&psock->refcnt))
-		goto out_rcu;
-
-	rcu_read_unlock();
-
-	if (ingress) {
-		err = bpf_tcp_ingress(sk, send, psock, md, flags);
-	} else {
-		lock_sock(sk);
-		err = bpf_tcp_push(sk, send, md, flags, false);
-		release_sock(sk);
-	}
-	smap_release_sock(psock, sk);
-	return err;
-out_rcu:
-	rcu_read_unlock();
-	return 0;
-}
-
-static inline void bpf_md_init(struct smap_psock *psock)
-{
-	if (!psock->apply_bytes) {
-		psock->eval =  __SK_NONE;
-		if (psock->sk_redir) {
-			sock_put(psock->sk_redir);
-			psock->sk_redir = NULL;
-		}
-	}
-}
-
-static void apply_bytes_dec(struct smap_psock *psock, int i)
-{
-	if (psock->apply_bytes) {
-		if (psock->apply_bytes < i)
-			psock->apply_bytes = 0;
-		else
-			psock->apply_bytes -= i;
-	}
-}
-
-static int bpf_exec_tx_verdict(struct smap_psock *psock,
-			       struct sk_msg_buff *m,
-			       struct sock *sk,
-			       int *copied, int flags)
-{
-	bool cork = false, enospc = (m->sg_start == m->sg_end);
-	struct sock *redir;
-	int err = 0;
-	int send;
-
-more_data:
-	if (psock->eval == __SK_NONE)
-		psock->eval = smap_do_tx_msg(sk, psock, m);
-
-	if (m->cork_bytes &&
-	    m->cork_bytes > psock->sg_size && !enospc) {
-		psock->cork_bytes = m->cork_bytes - psock->sg_size;
-		if (!psock->cork) {
-			psock->cork = kcalloc(1,
-					sizeof(struct sk_msg_buff),
-					GFP_ATOMIC | __GFP_NOWARN);
-
-			if (!psock->cork) {
-				err = -ENOMEM;
-				goto out_err;
-			}
-		}
-		memcpy(psock->cork, m, sizeof(*m));
-		goto out_err;
-	}
-
-	send = psock->sg_size;
-	if (psock->apply_bytes && psock->apply_bytes < send)
-		send = psock->apply_bytes;
-
-	switch (psock->eval) {
-	case __SK_PASS:
-		err = bpf_tcp_push(sk, send, m, flags, true);
-		if (unlikely(err)) {
-			*copied -= free_start_sg(sk, m, true);
-			break;
-		}
-
-		apply_bytes_dec(psock, send);
-		psock->sg_size -= send;
-		break;
-	case __SK_REDIRECT:
-		redir = psock->sk_redir;
-		apply_bytes_dec(psock, send);
-
-		if (psock->cork) {
-			cork = true;
-			psock->cork = NULL;
-		}
-
-		return_mem_sg(sk, send, m);
-		release_sock(sk);
-
-		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
-		lock_sock(sk);
-
-		if (unlikely(err < 0)) {
-			int free = free_start_sg(sk, m, false);
-
-			psock->sg_size = 0;
-			if (!cork)
-				*copied -= free;
-		} else {
-			psock->sg_size -= send;
-		}
-
-		if (cork) {
-			free_start_sg(sk, m, true);
-			psock->sg_size = 0;
-			kfree(m);
-			m = NULL;
-			err = 0;
-		}
-		break;
-	case __SK_DROP:
-	default:
-		free_bytes_sg(sk, send, m, true);
-		apply_bytes_dec(psock, send);
-		*copied -= send;
-		psock->sg_size -= send;
-		err = -EACCES;
-		break;
-	}
-
-	if (likely(!err)) {
-		bpf_md_init(psock);
-		if (m &&
-		    m->sg_data[m->sg_start].page_link &&
-		    m->sg_data[m->sg_start].length)
-			goto more_data;
-	}
-
-out_err:
-	return err;
-}
-
-static int bpf_wait_data(struct sock *sk,
-			 struct smap_psock *psk, int flags,
-			 long timeo, int *err)
-{
-	int rc;
-
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	rc = sk_wait_event(sk, &timeo,
-			   !list_empty(&psk->ingress) ||
-			   !skb_queue_empty(&sk->sk_receive_queue),
-			   &wait);
-	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	remove_wait_queue(sk_sleep(sk), &wait);
-
-	return rc;
-}
-
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-			   int nonblock, int flags, int *addr_len)
-{
-	struct iov_iter *iter = &msg->msg_iter;
-	struct smap_psock *psock;
-	int copied = 0;
-
-	if (unlikely(flags & MSG_ERRQUEUE))
-		return inet_recv_error(sk, msg, len, addr_len);
-	if (!skb_queue_empty(&sk->sk_receive_queue))
-		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out;
-
-	if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
-		goto out;
-	rcu_read_unlock();
-
-	lock_sock(sk);
-bytes_ready:
-	while (copied != len) {
-		struct scatterlist *sg;
-		struct sk_msg_buff *md;
-		int i;
-
-		md = list_first_entry_or_null(&psock->ingress,
-					      struct sk_msg_buff, list);
-		if (unlikely(!md))
-			break;
-		i = md->sg_start;
-		do {
-			struct page *page;
-			int n, copy;
-
-			sg = &md->sg_data[i];
-			copy = sg->length;
-			page = sg_page(sg);
-
-			if (copied + copy > len)
-				copy = len - copied;
-
-			n = copy_page_to_iter(page, sg->offset, copy, iter);
-			if (n != copy) {
-				md->sg_start = i;
-				release_sock(sk);
-				smap_release_sock(psock, sk);
-				return -EFAULT;
-			}
-
-			copied += copy;
-			sg->offset += copy;
-			sg->length -= copy;
-			sk_mem_uncharge(sk, copy);
-
-			if (!sg->length) {
-				i++;
-				if (i == MAX_SKB_FRAGS)
-					i = 0;
-				if (!md->skb)
-					put_page(page);
-			}
-			if (copied == len)
-				break;
-		} while (i != md->sg_end);
-		md->sg_start = i;
-
-		if (!sg->length && md->sg_start == md->sg_end) {
-			list_del(&md->list);
-			consume_skb(md->skb);
-			kfree(md);
-		}
-	}
-
-	if (!copied) {
-		long timeo;
-		int data;
-		int err = 0;
-
-		timeo = sock_rcvtimeo(sk, nonblock);
-		data = bpf_wait_data(sk, psock, flags, timeo, &err);
-
-		if (data) {
-			if (!skb_queue_empty(&sk->sk_receive_queue)) {
-				release_sock(sk);
-				smap_release_sock(psock, sk);
-				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-				return copied;
-			}
-			goto bytes_ready;
-		}
-
-		if (err)
-			copied = err;
-	}
-
-	release_sock(sk);
-	smap_release_sock(psock, sk);
-	return copied;
-out:
-	rcu_read_unlock();
-	return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-}
-
-
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
-	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
-	struct sk_msg_buff md = {0};
-	unsigned int sg_copy = 0;
-	struct smap_psock *psock;
-	int copied = 0, err = 0;
-	struct scatterlist *sg;
-	long timeo;
-
-	/* Its possible a sock event or user removed the psock _but_ the ops
-	 * have not been reprogrammed yet so we get here. In this case fallback
-	 * to tcp_sendmsg. Note this only works because we _only_ ever allow
-	 * a single ULP there is no hierarchy here.
-	 */
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		return tcp_sendmsg(sk, msg, size);
-	}
-
-	/* Increment the psock refcnt to ensure its not released while sending a
-	 * message. Required because sk lookup and bpf programs are used in
-	 * separate rcu critical sections. Its OK if we lose the map entry
-	 * but we can't lose the sock reference.
-	 */
-	if (!refcount_inc_not_zero(&psock->refcnt)) {
-		rcu_read_unlock();
-		return tcp_sendmsg(sk, msg, size);
-	}
-
-	sg = md.sg_data;
-	sg_init_marker(sg, MAX_SKB_FRAGS);
-	rcu_read_unlock();
-
-	lock_sock(sk);
-	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-
-	while (msg_data_left(msg)) {
-		struct sk_msg_buff *m = NULL;
-		bool enospc = false;
-		int copy;
-
-		if (sk->sk_err) {
-			err = -sk->sk_err;
-			goto out_err;
-		}
-
-		copy = msg_data_left(msg);
-		if (!sk_stream_memory_free(sk))
-			goto wait_for_sndbuf;
-
-		m = psock->cork_bytes ? psock->cork : &md;
-		m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
-		err = sk_alloc_sg(sk, copy, m->sg_data,
-				  m->sg_start, &m->sg_end, &sg_copy,
-				  m->sg_end - 1);
-		if (err) {
-			if (err != -ENOSPC)
-				goto wait_for_memory;
-			enospc = true;
-			copy = sg_copy;
-		}
-
-		err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
-		if (err < 0) {
-			free_curr_sg(sk, m);
-			goto out_err;
-		}
-
-		psock->sg_size += copy;
-		copied += copy;
-		sg_copy = 0;
-
-		/* When bytes are being corked skip running BPF program and
-		 * applying verdict unless there is no more buffer space. In
-		 * the ENOSPC case simply run BPF prorgram with currently
-		 * accumulated data. We don't have much choice at this point
-		 * we could try extending the page frags or chaining complex
-		 * frags but even in these cases _eventually_ we will hit an
-		 * OOM scenario. More complex recovery schemes may be
-		 * implemented in the future, but BPF programs must handle
-		 * the case where apply_cork requests are not honored. The
-		 * canonical method to verify this is to check data length.
-		 */
-		if (psock->cork_bytes) {
-			if (copy > psock->cork_bytes)
-				psock->cork_bytes = 0;
-			else
-				psock->cork_bytes -= copy;
-
-			if (psock->cork_bytes && !enospc)
-				goto out_cork;
-
-			/* All cork bytes accounted for re-run filter */
-			psock->eval = __SK_NONE;
-			psock->cork_bytes = 0;
-		}
-
-		err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-		if (unlikely(err < 0))
-			goto out_err;
-		continue;
-wait_for_sndbuf:
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
-		err = sk_stream_wait_memory(sk, &timeo);
-		if (err) {
-			if (m && m != psock->cork)
-				free_start_sg(sk, m, true);
-			goto out_err;
-		}
-	}
-out_err:
-	if (err < 0)
-		err = sk_stream_error(sk, msg->msg_flags, err);
-out_cork:
-	release_sock(sk);
-	smap_release_sock(psock, sk);
-	return copied ? copied : err;
-}
-
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
-			    int offset, size_t size, int flags)
-{
-	struct sk_msg_buff md = {0}, *m = NULL;
-	int err = 0, copied = 0;
-	struct smap_psock *psock;
-	struct scatterlist *sg;
-	bool enospc = false;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto accept;
-
-	if (!refcount_inc_not_zero(&psock->refcnt))
-		goto accept;
-	rcu_read_unlock();
-
-	lock_sock(sk);
-
-	if (psock->cork_bytes) {
-		m = psock->cork;
-		sg = &m->sg_data[m->sg_end];
-	} else {
-		m = &md;
-		sg = m->sg_data;
-		sg_init_marker(sg, MAX_SKB_FRAGS);
-	}
-
-	/* Catch case where ring is full and sendpage is stalled. */
-	if (unlikely(m->sg_end == m->sg_start &&
-	    m->sg_data[m->sg_end].length))
-		goto out_err;
-
-	psock->sg_size += size;
-	sg_set_page(sg, page, size, offset);
-	get_page(page);
-	m->sg_copy[m->sg_end] = true;
-	sk_mem_charge(sk, size);
-	m->sg_end++;
-	copied = size;
-
-	if (m->sg_end == MAX_SKB_FRAGS)
-		m->sg_end = 0;
-
-	if (m->sg_end == m->sg_start)
-		enospc = true;
-
-	if (psock->cork_bytes) {
-		if (size > psock->cork_bytes)
-			psock->cork_bytes = 0;
-		else
-			psock->cork_bytes -= size;
-
-		if (psock->cork_bytes && !enospc)
-			goto out_err;
-
-		/* All cork bytes accounted for re-run filter */
-		psock->eval = __SK_NONE;
-		psock->cork_bytes = 0;
-	}
-
-	err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-out_err:
-	release_sock(sk);
-	smap_release_sock(psock, sk);
-	return copied ? copied : err;
-accept:
-	rcu_read_unlock();
-	return tcp_sendpage(sk, page, offset, size, flags);
-}
-
-static void bpf_tcp_msg_add(struct smap_psock *psock,
-			    struct sock *sk,
-			    struct bpf_prog *tx_msg)
-{
-	struct bpf_prog *orig_tx_msg;
-
-	orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
-	if (orig_tx_msg)
-		bpf_prog_put(orig_tx_msg);
-}
-
-static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
-{
-	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
-	int rc;
-
-	if (unlikely(!prog))
-		return __SK_DROP;
-
-	skb_orphan(skb);
-	/* We need to ensure that BPF metadata for maps is also cleared
-	 * when we orphan the skb so that we don't have the possibility
-	 * to reference a stale map.
-	 */
-	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-	skb->sk = psock->sock;
-	bpf_compute_data_end_sk_skb(skb);
-	preempt_disable();
-	rc = (*prog->bpf_func)(skb, prog->insnsi);
-	preempt_enable();
-	skb->sk = NULL;
-
-	/* Moving return codes from UAPI namespace into internal namespace */
-	return rc == SK_PASS ?
-		(TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
-		__SK_DROP;
-}
-
-static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
-{
-	struct sock *sk = psock->sock;
-	int copied = 0, num_sg;
-	struct sk_msg_buff *r;
-
-	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
-	if (unlikely(!r))
-		return -EAGAIN;
-
-	if (!sk_rmem_schedule(sk, skb, skb->len)) {
-		kfree(r);
-		return -EAGAIN;
-	}
-
-	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
-	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
-	if (unlikely(num_sg < 0)) {
-		kfree(r);
-		return num_sg;
-	}
-	sk_mem_charge(sk, skb->len);
-	copied = skb->len;
-	r->sg_start = 0;
-	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
-	r->skb = skb;
-	list_add_tail(&r->list, &psock->ingress);
-	sk->sk_data_ready(sk);
-	return copied;
-}
-
-static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
-{
-	struct smap_psock *peer;
-	struct sock *sk;
-	__u32 in;
-	int rc;
-
-	rc = smap_verdict_func(psock, skb);
-	switch (rc) {
-	case __SK_REDIRECT:
-		sk = do_sk_redirect_map(skb);
-		if (!sk) {
-			kfree_skb(skb);
-			break;
-		}
-
-		peer = smap_psock_sk(sk);
-		in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-
-		if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
-			     !test_bit(SMAP_TX_RUNNING, &peer->state))) {
-			kfree_skb(skb);
-			break;
-		}
-
-		if (!in && sock_writeable(sk)) {
-			skb_set_owner_w(skb, sk);
-			skb_queue_tail(&peer->rxqueue, skb);
-			schedule_work(&peer->tx_work);
-			break;
-		} else if (in &&
-			   atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
-			skb_queue_tail(&peer->rxqueue, skb);
-			schedule_work(&peer->tx_work);
-			break;
-		}
-	/* Fall through and free skb otherwise */
-	case __SK_DROP:
-	default:
-		kfree_skb(skb);
-	}
-}
-
-static void smap_report_sk_error(struct smap_psock *psock, int err)
-{
-	struct sock *sk = psock->sock;
-
-	sk->sk_err = err;
-	sk->sk_error_report(sk);
-}
-
-static void smap_read_sock_strparser(struct strparser *strp,
-				     struct sk_buff *skb)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = container_of(strp, struct smap_psock, strp);
-	smap_do_verdict(psock, skb);
-	rcu_read_unlock();
-}
-
-/* Called with lock held on socket */
-static void smap_data_ready(struct sock *sk)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (likely(psock)) {
-		write_lock_bh(&sk->sk_callback_lock);
-		strp_data_ready(&psock->strp);
-		write_unlock_bh(&sk->sk_callback_lock);
-	}
-	rcu_read_unlock();
-}
-
-static void smap_tx_work(struct work_struct *w)
-{
-	struct smap_psock *psock;
-	struct sk_buff *skb;
-	int rem, off, n;
-
-	psock = container_of(w, struct smap_psock, tx_work);
-
-	/* lock sock to avoid losing sk_socket at some point during loop */
-	lock_sock(psock->sock);
-	if (psock->save_skb) {
-		skb = psock->save_skb;
-		rem = psock->save_rem;
-		off = psock->save_off;
-		psock->save_skb = NULL;
-		goto start;
-	}
-
-	while ((skb = skb_dequeue(&psock->rxqueue))) {
-		__u32 flags;
-
-		rem = skb->len;
-		off = 0;
-start:
-		flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-		do {
-			if (likely(psock->sock->sk_socket)) {
-				if (flags)
-					n = smap_do_ingress(psock, skb);
-				else
-					n = skb_send_sock_locked(psock->sock,
-								 skb, off, rem);
-			} else {
-				n = -EINVAL;
-			}
-
-			if (n <= 0) {
-				if (n == -EAGAIN) {
-					/* Retry when space is available */
-					psock->save_skb = skb;
-					psock->save_rem = rem;
-					psock->save_off = off;
-					goto out;
-				}
-				/* Hard errors break pipe and stop xmit */
-				smap_report_sk_error(psock, n ? -n : EPIPE);
-				clear_bit(SMAP_TX_RUNNING, &psock->state);
-				kfree_skb(skb);
-				goto out;
-			}
-			rem -= n;
-			off += n;
-		} while (rem);
-
-		if (!flags)
-			kfree_skb(skb);
-	}
-out:
-	release_sock(psock->sock);
-}
-
-static void smap_write_space(struct sock *sk)
-{
-	struct smap_psock *psock;
-	void (*write_space)(struct sock *sk);
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
-		schedule_work(&psock->tx_work);
-	write_space = psock->save_write_space;
-	rcu_read_unlock();
-	write_space(sk);
-}
-
-static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
-{
-	if (!psock->strp_enabled)
-		return;
-	sk->sk_data_ready = psock->save_data_ready;
-	sk->sk_write_space = psock->save_write_space;
-	psock->save_data_ready = NULL;
-	psock->save_write_space = NULL;
-	strp_stop(&psock->strp);
-	psock->strp_enabled = false;
-}
-
-static void smap_destroy_psock(struct rcu_head *rcu)
-{
-	struct smap_psock *psock = container_of(rcu,
-						  struct smap_psock, rcu);
-
-	/* Now that a grace period has passed there is no longer
-	 * any reference to this sock in the sockmap so we can
-	 * destroy the psock, strparser, and bpf programs. But,
-	 * because we use workqueue sync operations we can not
-	 * do it in rcu context
-	 */
-	schedule_work(&psock->gc_work);
-}
-
-static bool psock_is_smap_sk(struct sock *sk)
-{
-	return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
-}
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
-{
-	if (refcount_dec_and_test(&psock->refcnt)) {
-		if (psock_is_smap_sk(sock))
-			bpf_tcp_release(sock);
-		write_lock_bh(&sock->sk_callback_lock);
-		smap_stop_sock(psock, sock);
-		write_unlock_bh(&sock->sk_callback_lock);
-		clear_bit(SMAP_TX_RUNNING, &psock->state);
-		rcu_assign_sk_user_data(sock, NULL);
-		call_rcu_sched(&psock->rcu, smap_destroy_psock);
-	}
-}
-
-static int smap_parse_func_strparser(struct strparser *strp,
-				       struct sk_buff *skb)
-{
-	struct smap_psock *psock;
-	struct bpf_prog *prog;
-	int rc;
-
-	rcu_read_lock();
-	psock = container_of(strp, struct smap_psock, strp);
-	prog = READ_ONCE(psock->bpf_parse);
-
-	if (unlikely(!prog)) {
-		rcu_read_unlock();
-		return skb->len;
-	}
-
-	/* Attach socket for bpf program to use if needed we can do this
-	 * because strparser clones the skb before handing it to a upper
-	 * layer, meaning skb_orphan has been called. We NULL sk on the
-	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
-	 * later and because we are not charging the memory of this skb to
-	 * any socket yet.
-	 */
-	skb->sk = psock->sock;
-	bpf_compute_data_end_sk_skb(skb);
-	rc = (*prog->bpf_func)(skb, prog->insnsi);
-	skb->sk = NULL;
-	rcu_read_unlock();
-	return rc;
-}
-
-static int smap_read_sock_done(struct strparser *strp, int err)
-{
-	return err;
-}
-
-static int smap_init_sock(struct smap_psock *psock,
-			  struct sock *sk)
-{
-	static const struct strp_callbacks cb = {
-		.rcv_msg = smap_read_sock_strparser,
-		.parse_msg = smap_parse_func_strparser,
-		.read_sock_done = smap_read_sock_done,
-	};
-
-	return strp_init(&psock->strp, sk, &cb);
-}
-
-static void smap_init_progs(struct smap_psock *psock,
-			    struct bpf_prog *verdict,
-			    struct bpf_prog *parse)
-{
-	struct bpf_prog *orig_parse, *orig_verdict;
-
-	orig_parse = xchg(&psock->bpf_parse, parse);
-	orig_verdict = xchg(&psock->bpf_verdict, verdict);
-
-	if (orig_verdict)
-		bpf_prog_put(orig_verdict);
-	if (orig_parse)
-		bpf_prog_put(orig_parse);
-}
-
-static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
-{
-	if (sk->sk_data_ready == smap_data_ready)
-		return;
-	psock->save_data_ready = sk->sk_data_ready;
-	psock->save_write_space = sk->sk_write_space;
-	sk->sk_data_ready = smap_data_ready;
-	sk->sk_write_space = smap_write_space;
-	psock->strp_enabled = true;
-}
-
-static void sock_map_remove_complete(struct bpf_stab *stab)
-{
-	bpf_map_area_free(stab->sock_map);
-	kfree(stab);
-}
-
-static void smap_gc_work(struct work_struct *w)
-{
-	struct smap_psock_map_entry *e, *tmp;
-	struct sk_msg_buff *md, *mtmp;
-	struct smap_psock *psock;
-
-	psock = container_of(w, struct smap_psock, gc_work);
-
-	/* no callback lock needed because we already detached sockmap ops */
-	if (psock->strp_enabled)
-		strp_done(&psock->strp);
-
-	cancel_work_sync(&psock->tx_work);
-	__skb_queue_purge(&psock->rxqueue);
-
-	/* At this point all strparser and xmit work must be complete */
-	if (psock->bpf_parse)
-		bpf_prog_put(psock->bpf_parse);
-	if (psock->bpf_verdict)
-		bpf_prog_put(psock->bpf_verdict);
-	if (psock->bpf_tx_msg)
-		bpf_prog_put(psock->bpf_tx_msg);
-
-	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork, true);
-		kfree(psock->cork);
-	}
-
-	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
-		list_del(&md->list);
-		free_start_sg(psock->sock, md, true);
-		kfree(md);
-	}
-
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		list_del(&e->list);
-		kfree(e);
-	}
-
-	if (psock->sk_redir)
-		sock_put(psock->sk_redir);
-
-	sock_put(psock->sock);
-	kfree(psock);
-}
-
-static struct smap_psock *smap_init_psock(struct sock *sock, int node)
-{
-	struct smap_psock *psock;
-
-	psock = kzalloc_node(sizeof(struct smap_psock),
-			     GFP_ATOMIC | __GFP_NOWARN,
-			     node);
-	if (!psock)
-		return ERR_PTR(-ENOMEM);
-
-	psock->eval =  __SK_NONE;
-	psock->sock = sock;
-	skb_queue_head_init(&psock->rxqueue);
-	INIT_WORK(&psock->tx_work, smap_tx_work);
-	INIT_WORK(&psock->gc_work, smap_gc_work);
-	INIT_LIST_HEAD(&psock->maps);
-	INIT_LIST_HEAD(&psock->ingress);
-	refcount_set(&psock->refcnt, 1);
-	spin_lock_init(&psock->maps_lock);
-
-	rcu_assign_sk_user_data(sock, psock);
-	sock_hold(sock);
-	return psock;
-}
-
-static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_stab *stab;
-	u64 cost;
-	int err;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	stab = kzalloc(sizeof(*stab), GFP_USER);
-	if (!stab)
-		return ERR_PTR(-ENOMEM);
-
-	bpf_map_init_from_attr(&stab->map, attr);
-	raw_spin_lock_init(&stab->lock);
-
-	/* make sure page count doesn't overflow */
-	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
-	err = -EINVAL;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_stab;
-
-	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(stab->map.pages);
-	if (err)
-		goto free_stab;
-
-	err = -ENOMEM;
-	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
-					    sizeof(struct sock *),
-					    stab->map.numa_node);
-	if (!stab->sock_map)
-		goto free_stab;
-
-	return &stab->map;
-free_stab:
-	kfree(stab);
-	return ERR_PTR(err);
-}
-
-static void smap_list_map_remove(struct smap_psock *psock,
-				 struct sock **entry)
-{
-	struct smap_psock_map_entry *e, *tmp;
-
-	spin_lock_bh(&psock->maps_lock);
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry) {
-			list_del(&e->list);
-			kfree(e);
-		}
-	}
-	spin_unlock_bh(&psock->maps_lock);
-}
-
-static void smap_list_hash_remove(struct smap_psock *psock,
-				  struct htab_elem *hash_link)
-{
-	struct smap_psock_map_entry *e, *tmp;
-
-	spin_lock_bh(&psock->maps_lock);
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		struct htab_elem *c = rcu_dereference(e->hash_link);
-
-		if (c == hash_link) {
-			list_del(&e->list);
-			kfree(e);
-		}
-	}
-	spin_unlock_bh(&psock->maps_lock);
-}
-
-static void sock_map_free(struct bpf_map *map)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	int i;
-
-	synchronize_rcu();
-
-	/* At this point no update, lookup or delete operations can happen.
-	 * However, be aware we can still get a socket state event updates,
-	 * and data ready callabacks that reference the psock from sk_user_data
-	 * Also psock worker threads are still in-flight. So smap_release_sock
-	 * will only free the psock after cancel_sync on the worker threads
-	 * and a grace period expire to ensure psock is really safe to remove.
-	 */
-	rcu_read_lock();
-	raw_spin_lock_bh(&stab->lock);
-	for (i = 0; i < stab->map.max_entries; i++) {
-		struct smap_psock *psock;
-		struct sock *sock;
-
-		sock = stab->sock_map[i];
-		if (!sock)
-			continue;
-		stab->sock_map[i] = NULL;
-		psock = smap_psock_sk(sock);
-		/* This check handles a racing sock event that can get the
-		 * sk_callback_lock before this case but after xchg happens
-		 * causing the refcnt to hit zero and sock user data (psock)
-		 * to be null and queued for garbage collection.
-		 */
-		if (likely(psock)) {
-			smap_list_map_remove(psock, &stab->sock_map[i]);
-			smap_release_sock(psock, sock);
-		}
-	}
-	raw_spin_unlock_bh(&stab->lock);
-	rcu_read_unlock();
-
-	sock_map_remove_complete(stab);
-}
-
-static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	u32 i = key ? *(u32 *)key : U32_MAX;
-	u32 *next = (u32 *)next_key;
-
-	if (i >= stab->map.max_entries) {
-		*next = 0;
-		return 0;
-	}
-
-	if (i == stab->map.max_entries - 1)
-		return -ENOENT;
-
-	*next = i + 1;
-	return 0;
-}
-
-struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-	if (key >= map->max_entries)
-		return NULL;
-
-	return READ_ONCE(stab->sock_map[key]);
-}
-
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct smap_psock *psock;
-	int k = *(u32 *)key;
-	struct sock *sock;
-
-	if (k >= map->max_entries)
-		return -EINVAL;
-
-	raw_spin_lock_bh(&stab->lock);
-	sock = stab->sock_map[k];
-	stab->sock_map[k] = NULL;
-	raw_spin_unlock_bh(&stab->lock);
-	if (!sock)
-		return -EINVAL;
-
-	psock = smap_psock_sk(sock);
-	if (!psock)
-		return 0;
-	if (psock->bpf_parse) {
-		write_lock_bh(&sock->sk_callback_lock);
-		smap_stop_sock(psock, sock);
-		write_unlock_bh(&sock->sk_callback_lock);
-	}
-	smap_list_map_remove(psock, &stab->sock_map[k]);
-	smap_release_sock(psock, sock);
-	return 0;
-}
-
-/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
- * done inside rcu critical sections. This ensures on updates that the psock
- * will not be released via smap_release_sock() until concurrent updates/deletes
- * complete. All operations operate on sock_map using cmpxchg and xchg
- * operations to ensure we do not get stale references. Any reads into the
- * map must be done with READ_ONCE() because of this.
- *
- * A psock is destroyed via call_rcu and after any worker threads are cancelled
- * and syncd so we are certain all references from the update/lookup/delete
- * operations as well as references in the data path are no longer in use.
- *
- * Psocks may exist in multiple maps, but only a single set of parse/verdict
- * programs may be inherited from the maps it belongs to. A reference count
- * is kept with the total number of references to the psock from all maps. The
- * psock will not be released until this reaches zero. The psock and sock
- * user data data use the sk_callback_lock to protect critical data structures
- * from concurrent access. This allows us to avoid two updates from modifying
- * the user data in sock and the lock is required anyways for modifying
- * callbacks, we simply increase its scope slightly.
- *
- * Rules to follow,
- *  - psock must always be read inside RCU critical section
- *  - sk_user_data must only be modified inside sk_callback_lock and read
- *    inside RCU critical section.
- *  - psock->maps list must only be read & modified inside sk_callback_lock
- *  - sock_map must use READ_ONCE and (cmp)xchg operations
- *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
- */
-
-static int __sock_map_ctx_update_elem(struct bpf_map *map,
-				      struct bpf_sock_progs *progs,
-				      struct sock *sock,
-				      void *key)
-{
-	struct bpf_prog *verdict, *parse, *tx_msg;
-	struct smap_psock *psock;
-	bool new = false;
-	int err = 0;
-
-	/* 1. If sock map has BPF programs those will be inherited by the
-	 * sock being added. If the sock is already attached to BPF programs
-	 * this results in an error.
-	 */
-	verdict = READ_ONCE(progs->bpf_verdict);
-	parse = READ_ONCE(progs->bpf_parse);
-	tx_msg = READ_ONCE(progs->bpf_tx_msg);
-
-	if (parse && verdict) {
-		/* bpf prog refcnt may be zero if a concurrent attach operation
-		 * removes the program after the above READ_ONCE() but before
-		 * we increment the refcnt. If this is the case abort with an
-		 * error.
-		 */
-		verdict = bpf_prog_inc_not_zero(verdict);
-		if (IS_ERR(verdict))
-			return PTR_ERR(verdict);
-
-		parse = bpf_prog_inc_not_zero(parse);
-		if (IS_ERR(parse)) {
-			bpf_prog_put(verdict);
-			return PTR_ERR(parse);
-		}
-	}
-
-	if (tx_msg) {
-		tx_msg = bpf_prog_inc_not_zero(tx_msg);
-		if (IS_ERR(tx_msg)) {
-			if (parse && verdict) {
-				bpf_prog_put(parse);
-				bpf_prog_put(verdict);
-			}
-			return PTR_ERR(tx_msg);
-		}
-	}
-
-	psock = smap_psock_sk(sock);
-
-	/* 2. Do not allow inheriting programs if psock exists and has
-	 * already inherited programs. This would create confusion on
-	 * which parser/verdict program is running. If no psock exists
-	 * create one. Inside sk_callback_lock to ensure concurrent create
-	 * doesn't update user data.
-	 */
-	if (psock) {
-		if (!psock_is_smap_sk(sock)) {
-			err = -EBUSY;
-			goto out_progs;
-		}
-		if (READ_ONCE(psock->bpf_parse) && parse) {
-			err = -EBUSY;
-			goto out_progs;
-		}
-		if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
-			err = -EBUSY;
-			goto out_progs;
-		}
-		if (!refcount_inc_not_zero(&psock->refcnt)) {
-			err = -EAGAIN;
-			goto out_progs;
-		}
-	} else {
-		psock = smap_init_psock(sock, map->numa_node);
-		if (IS_ERR(psock)) {
-			err = PTR_ERR(psock);
-			goto out_progs;
-		}
-
-		set_bit(SMAP_TX_RUNNING, &psock->state);
-		new = true;
-	}
-
-	/* 3. At this point we have a reference to a valid psock that is
-	 * running. Attach any BPF programs needed.
-	 */
-	if (tx_msg)
-		bpf_tcp_msg_add(psock, sock, tx_msg);
-	if (new) {
-		err = bpf_tcp_init(sock);
-		if (err)
-			goto out_free;
-	}
-
-	if (parse && verdict && !psock->strp_enabled) {
-		err = smap_init_sock(psock, sock);
-		if (err)
-			goto out_free;
-		smap_init_progs(psock, verdict, parse);
-		write_lock_bh(&sock->sk_callback_lock);
-		smap_start_sock(psock, sock);
-		write_unlock_bh(&sock->sk_callback_lock);
-	}
-
-	return err;
-out_free:
-	smap_release_sock(psock, sock);
-out_progs:
-	if (parse && verdict) {
-		bpf_prog_put(parse);
-		bpf_prog_put(verdict);
-	}
-	if (tx_msg)
-		bpf_prog_put(tx_msg);
-	return err;
-}
-
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				    struct bpf_map *map,
-				    void *key, u64 flags)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct bpf_sock_progs *progs = &stab->progs;
-	struct sock *osock, *sock = skops->sk;
-	struct smap_psock_map_entry *e;
-	struct smap_psock *psock;
-	u32 i = *(u32 *)key;
-	int err;
-
-	if (unlikely(flags > BPF_EXIST))
-		return -EINVAL;
-	if (unlikely(i >= stab->map.max_entries))
-		return -E2BIG;
-
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e)
-		return -ENOMEM;
-
-	err = __sock_map_ctx_update_elem(map, progs, sock, key);
-	if (err)
-		goto out;
-
-	/* psock guaranteed to be present. */
-	psock = smap_psock_sk(sock);
-	raw_spin_lock_bh(&stab->lock);
-	osock = stab->sock_map[i];
-	if (osock && flags == BPF_NOEXIST) {
-		err = -EEXIST;
-		goto out_unlock;
-	}
-	if (!osock && flags == BPF_EXIST) {
-		err = -ENOENT;
-		goto out_unlock;
-	}
-
-	e->entry = &stab->sock_map[i];
-	e->map = map;
-	spin_lock_bh(&psock->maps_lock);
-	list_add_tail(&e->list, &psock->maps);
-	spin_unlock_bh(&psock->maps_lock);
-
-	stab->sock_map[i] = sock;
-	if (osock) {
-		psock = smap_psock_sk(osock);
-		smap_list_map_remove(psock, &stab->sock_map[i]);
-		smap_release_sock(psock, osock);
-	}
-	raw_spin_unlock_bh(&stab->lock);
-	return 0;
-out_unlock:
-	smap_release_sock(psock, sock);
-	raw_spin_unlock_bh(&stab->lock);
-out:
-	kfree(e);
-	return err;
-}
-
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
-{
-	struct bpf_sock_progs *progs;
-	struct bpf_prog *orig;
-
-	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-		progs = &stab->progs;
-	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
-		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
-		progs = &htab->progs;
-	} else {
-		return -EINVAL;
-	}
-
-	switch (type) {
-	case BPF_SK_MSG_VERDICT:
-		orig = xchg(&progs->bpf_tx_msg, prog);
-		break;
-	case BPF_SK_SKB_STREAM_PARSER:
-		orig = xchg(&progs->bpf_parse, prog);
-		break;
-	case BPF_SK_SKB_STREAM_VERDICT:
-		orig = xchg(&progs->bpf_verdict, prog);
-		break;
-	default:
-		return -EOPNOTSUPP;
-	}
-
-	if (orig)
-		bpf_prog_put(orig);
-
-	return 0;
-}
-
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-			struct bpf_prog *prog)
-{
-	int ufd = attr->target_fd;
-	struct bpf_map *map;
-	struct fd f;
-	int err;
-
-	f = fdget(ufd);
-	map = __bpf_map_get(f);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
-
-	err = sock_map_prog(map, prog, attr->attach_type);
-	fdput(f);
-	return err;
-}
-
-static void *sock_map_lookup(struct bpf_map *map, void *key)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int sock_map_update_elem(struct bpf_map *map,
-				void *key, void *value, u64 flags)
-{
-	struct bpf_sock_ops_kern skops;
-	u32 fd = *(u32 *)value;
-	struct socket *socket;
-	int err;
-
-	socket = sockfd_lookup(fd, &err);
-	if (!socket)
-		return err;
-
-	skops.sk = socket->sk;
-	if (!skops.sk) {
-		fput(socket->file);
-		return -EINVAL;
-	}
-
-	/* ULPs are currently supported only for TCP sockets in ESTABLISHED
-	 * state.
-	 */
-	if (skops.sk->sk_type != SOCK_STREAM ||
-	    skops.sk->sk_protocol != IPPROTO_TCP ||
-	    skops.sk->sk_state != TCP_ESTABLISHED) {
-		fput(socket->file);
-		return -EOPNOTSUPP;
-	}
-
-	lock_sock(skops.sk);
-	preempt_disable();
-	rcu_read_lock();
-	err = sock_map_ctx_update_elem(&skops, map, key, flags);
-	rcu_read_unlock();
-	preempt_enable();
-	release_sock(skops.sk);
-	fput(socket->file);
-	return err;
-}
-
-static void sock_map_release(struct bpf_map *map)
-{
-	struct bpf_sock_progs *progs;
-	struct bpf_prog *orig;
-
-	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-		progs = &stab->progs;
-	} else {
-		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
-		progs = &htab->progs;
-	}
-
-	orig = xchg(&progs->bpf_parse, NULL);
-	if (orig)
-		bpf_prog_put(orig);
-	orig = xchg(&progs->bpf_verdict, NULL);
-	if (orig)
-		bpf_prog_put(orig);
-
-	orig = xchg(&progs->bpf_tx_msg, NULL);
-	if (orig)
-		bpf_prog_put(orig);
-}
-
-static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
-{
-	struct bpf_htab *htab;
-	int i, err;
-	u64 cost;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 ||
-	    attr->key_size == 0 ||
-	    attr->value_size != 4 ||
-	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	if (attr->key_size > MAX_BPF_STACK)
-		/* eBPF programs initialize keys on stack, so they cannot be
-		 * larger than max stack size
-		 */
-		return ERR_PTR(-E2BIG);
-
-	htab = kzalloc(sizeof(*htab), GFP_USER);
-	if (!htab)
-		return ERR_PTR(-ENOMEM);
-
-	bpf_map_init_from_attr(&htab->map, attr);
-
-	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
-	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8);
-	err = -EINVAL;
-	if (htab->n_buckets == 0 ||
-	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
-		goto free_htab;
-
-	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
-	       (u64) htab->elem_size * htab->map.max_entries;
-
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_htab;
-
-	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	err = bpf_map_precharge_memlock(htab->map.pages);
-	if (err)
-		goto free_htab;
-
-	err = -ENOMEM;
-	htab->buckets = bpf_map_area_alloc(
-				htab->n_buckets * sizeof(struct bucket),
-				htab->map.numa_node);
-	if (!htab->buckets)
-		goto free_htab;
-
-	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_HEAD(&htab->buckets[i].head);
-		raw_spin_lock_init(&htab->buckets[i].lock);
-	}
-
-	return &htab->map;
-free_htab:
-	kfree(htab);
-	return ERR_PTR(err);
-}
-
-static void __bpf_htab_free(struct rcu_head *rcu)
-{
-	struct bpf_htab *htab;
-
-	htab = container_of(rcu, struct bpf_htab, rcu);
-	bpf_map_area_free(htab->buckets);
-	kfree(htab);
-}
-
-static void sock_hash_free(struct bpf_map *map)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	int i;
-
-	synchronize_rcu();
-
-	/* At this point no update, lookup or delete operations can happen.
-	 * However, be aware we can still get a socket state event updates,
-	 * and data ready callabacks that reference the psock from sk_user_data
-	 * Also psock worker threads are still in-flight. So smap_release_sock
-	 * will only free the psock after cancel_sync on the worker threads
-	 * and a grace period expire to ensure psock is really safe to remove.
-	 */
-	rcu_read_lock();
-	for (i = 0; i < htab->n_buckets; i++) {
-		struct bucket *b = __select_bucket(htab, i);
-		struct hlist_head *head;
-		struct hlist_node *n;
-		struct htab_elem *l;
-
-		raw_spin_lock_bh(&b->lock);
-		head = &b->head;
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
-			struct sock *sock = l->sk;
-			struct smap_psock *psock;
-
-			hlist_del_rcu(&l->hash_node);
-			psock = smap_psock_sk(sock);
-			/* This check handles a racing sock event that can get
-			 * the sk_callback_lock before this case but after xchg
-			 * causing the refcnt to hit zero and sock user data
-			 * (psock) to be null and queued for garbage collection.
-			 */
-			if (likely(psock)) {
-				smap_list_hash_remove(psock, l);
-				smap_release_sock(psock, sock);
-			}
-			free_htab_elem(htab, l);
-		}
-		raw_spin_unlock_bh(&b->lock);
-	}
-	rcu_read_unlock();
-	call_rcu(&htab->rcu, __bpf_htab_free);
-}
-
-static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
-					      void *key, u32 key_size, u32 hash,
-					      struct sock *sk,
-					      struct htab_elem *old_elem)
-{
-	struct htab_elem *l_new;
-
-	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-		if (!old_elem) {
-			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
-		}
-	}
-	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
-			     htab->map.numa_node);
-	if (!l_new) {
-		atomic_dec(&htab->count);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	memcpy(l_new->key, key, key_size);
-	l_new->sk = sk;
-	l_new->hash = hash;
-	return l_new;
-}
-
-static inline u32 htab_map_hash(const void *key, u32 key_len)
-{
-	return jhash(key, key_len, 0);
-}
-
-static int sock_hash_get_next_key(struct bpf_map *map,
-				  void *key, void *next_key)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct htab_elem *l, *next_l;
-	struct hlist_head *h;
-	u32 hash, key_size;
-	int i = 0;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	key_size = map->key_size;
-	if (!key)
-		goto find_first_elem;
-	hash = htab_map_hash(key, key_size);
-	h = select_bucket(htab, hash);
-
-	l = lookup_elem_raw(h, hash, key, key_size);
-	if (!l)
-		goto find_first_elem;
-	next_l = hlist_entry_safe(
-		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
-		     struct htab_elem, hash_node);
-	if (next_l) {
-		memcpy(next_key, next_l->key, key_size);
-		return 0;
-	}
-
-	/* no more elements in this hash list, go to the next bucket */
-	i = hash & (htab->n_buckets - 1);
-	i++;
-
-find_first_elem:
-	/* iterate over buckets */
-	for (; i < htab->n_buckets; i++) {
-		h = select_bucket(htab, i);
-
-		/* pick first element in the bucket */
-		next_l = hlist_entry_safe(
-				rcu_dereference_raw(hlist_first_rcu(h)),
-				struct htab_elem, hash_node);
-		if (next_l) {
-			/* if it's not empty, just return it */
-			memcpy(next_key, next_l->key, key_size);
-			return 0;
-		}
-	}
-
-	/* iterated over all buckets and all elements */
-	return -ENOENT;
-}
-
-static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				     struct bpf_map *map,
-				     void *key, u64 map_flags)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct bpf_sock_progs *progs = &htab->progs;
-	struct htab_elem *l_new = NULL, *l_old;
-	struct smap_psock_map_entry *e = NULL;
-	struct hlist_head *head;
-	struct smap_psock *psock;
-	u32 key_size, hash;
-	struct sock *sock;
-	struct bucket *b;
-	int err;
-
-	sock = skops->sk;
-
-	if (sock->sk_type != SOCK_STREAM ||
-	    sock->sk_protocol != IPPROTO_TCP)
-		return -EOPNOTSUPP;
-
-	if (unlikely(map_flags > BPF_EXIST))
-		return -EINVAL;
-
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e)
-		return -ENOMEM;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	key_size = map->key_size;
-	hash = htab_map_hash(key, key_size);
-	b = __select_bucket(htab, hash);
-	head = &b->head;
-
-	err = __sock_map_ctx_update_elem(map, progs, sock, key);
-	if (err)
-		goto err;
-
-	/* psock is valid here because otherwise above *ctx_update_elem would
-	 * have thrown an error. It is safe to skip error check.
-	 */
-	psock = smap_psock_sk(sock);
-	raw_spin_lock_bh(&b->lock);
-	l_old = lookup_elem_raw(head, hash, key, key_size);
-	if (l_old && map_flags == BPF_NOEXIST) {
-		err = -EEXIST;
-		goto bucket_err;
-	}
-	if (!l_old && map_flags == BPF_EXIST) {
-		err = -ENOENT;
-		goto bucket_err;
-	}
-
-	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
-	if (IS_ERR(l_new)) {
-		err = PTR_ERR(l_new);
-		goto bucket_err;
-	}
-
-	rcu_assign_pointer(e->hash_link, l_new);
-	e->map = map;
-	spin_lock_bh(&psock->maps_lock);
-	list_add_tail(&e->list, &psock->maps);
-	spin_unlock_bh(&psock->maps_lock);
-
-	/* add new element to the head of the list, so that
-	 * concurrent search will find it before old elem
-	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
-	if (l_old) {
-		psock = smap_psock_sk(l_old->sk);
-
-		hlist_del_rcu(&l_old->hash_node);
-		smap_list_hash_remove(psock, l_old);
-		smap_release_sock(psock, l_old->sk);
-		free_htab_elem(htab, l_old);
-	}
-	raw_spin_unlock_bh(&b->lock);
-	return 0;
-bucket_err:
-	smap_release_sock(psock, sock);
-	raw_spin_unlock_bh(&b->lock);
-err:
-	kfree(e);
-	return err;
-}
-
-static int sock_hash_update_elem(struct bpf_map *map,
-				void *key, void *value, u64 flags)
-{
-	struct bpf_sock_ops_kern skops;
-	u32 fd = *(u32 *)value;
-	struct socket *socket;
-	int err;
-
-	socket = sockfd_lookup(fd, &err);
-	if (!socket)
-		return err;
-
-	skops.sk = socket->sk;
-	if (!skops.sk) {
-		fput(socket->file);
-		return -EINVAL;
-	}
-
-	/* ULPs are currently supported only for TCP sockets in ESTABLISHED
-	 * state.
-	 */
-	if (skops.sk->sk_type != SOCK_STREAM ||
-	    skops.sk->sk_protocol != IPPROTO_TCP ||
-	    skops.sk->sk_state != TCP_ESTABLISHED) {
-		fput(socket->file);
-		return -EOPNOTSUPP;
-	}
-
-	lock_sock(skops.sk);
-	preempt_disable();
-	rcu_read_lock();
-	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
-	rcu_read_unlock();
-	preempt_enable();
-	release_sock(skops.sk);
-	fput(socket->file);
-	return err;
-}
-
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
-	struct bucket *b;
-	struct htab_elem *l;
-	u32 hash, key_size;
-	int ret = -ENOENT;
-
-	key_size = map->key_size;
-	hash = htab_map_hash(key, key_size);
-	b = __select_bucket(htab, hash);
-	head = &b->head;
-
-	raw_spin_lock_bh(&b->lock);
-	l = lookup_elem_raw(head, hash, key, key_size);
-	if (l) {
-		struct sock *sock = l->sk;
-		struct smap_psock *psock;
-
-		hlist_del_rcu(&l->hash_node);
-		psock = smap_psock_sk(sock);
-		/* This check handles a racing sock event that can get the
-		 * sk_callback_lock before this case but after xchg happens
-		 * causing the refcnt to hit zero and sock user data (psock)
-		 * to be null and queued for garbage collection.
-		 */
-		if (likely(psock)) {
-			smap_list_hash_remove(psock, l);
-			smap_release_sock(psock, sock);
-		}
-		free_htab_elem(htab, l);
-		ret = 0;
-	}
-	raw_spin_unlock_bh(&b->lock);
-	return ret;
-}
-
-struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
-	struct htab_elem *l;
-	u32 key_size, hash;
-	struct bucket *b;
-	struct sock *sk;
-
-	key_size = map->key_size;
-	hash = htab_map_hash(key, key_size);
-	b = __select_bucket(htab, hash);
-	head = &b->head;
-
-	l = lookup_elem_raw(head, hash, key, key_size);
-	sk = l ? l->sk : NULL;
-	return sk;
-}
-
-const struct bpf_map_ops sock_map_ops = {
-	.map_alloc = sock_map_alloc,
-	.map_free = sock_map_free,
-	.map_lookup_elem = sock_map_lookup,
-	.map_get_next_key = sock_map_get_next_key,
-	.map_update_elem = sock_map_update_elem,
-	.map_delete_elem = sock_map_delete_elem,
-	.map_release_uref = sock_map_release,
-	.map_check_btf = map_check_no_btf,
-};
-
-const struct bpf_map_ops sock_hash_ops = {
-	.map_alloc = sock_hash_alloc,
-	.map_free = sock_hash_free,
-	.map_lookup_elem = sock_map_lookup,
-	.map_get_next_key = sock_hash_get_next_key,
-	.map_update_elem = sock_hash_update_elem,
-	.map_delete_elem = sock_hash_delete_elem,
-	.map_release_uref = sock_map_release,
-	.map_check_btf = map_check_no_btf,
-};
-
-static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
-{
-	return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
-	       ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
-}
-BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	/* ULPs are currently supported only for TCP sockets in ESTABLISHED
-	 * state. This checks that the sock ops triggering the update is
-	 * one indicating we are (or will be soon) in an ESTABLISHED state.
-	 */
-	if (!bpf_is_valid_sock_op(bpf_sock))
-		return -EOPNOTSUPP;
-	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_map_update_proto = {
-	.func		= bpf_sock_map_update,
-	.gpl_only	= false,
-	.pkt_access	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_CONST_MAP_PTR,
-	.arg3_type	= ARG_PTR_TO_MAP_KEY,
-	.arg4_type	= ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (!bpf_is_valid_sock_op(bpf_sock))
-		return -EOPNOTSUPP;
-	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_hash_update_proto = {
-	.func		= bpf_sock_hash_update,
-	.gpl_only	= false,
-	.pkt_access	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_CONST_MAP_PTR,
-	.arg3_type	= ARG_PTR_TO_MAP_KEY,
-	.arg4_type	= ARG_ANYTHING,
-};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 53968f82b919..f4ecd6ed2252 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1664,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	switch (ptype) {
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
-		ret = sockmap_get_from_fd(attr, ptype, prog);
+		ret = sock_map_get_from_fd(attr, prog);
 		break;
 	case BPF_PROG_TYPE_LIRC_MODE2:
 		ret = lirc_prog_attach(attr, prog);
@@ -1718,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
+		return sock_map_get_from_fd(attr, NULL);
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
+		return sock_map_get_from_fd(attr, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
 	case BPF_FLOW_DISSECTOR:
diff --git a/net/Kconfig b/net/Kconfig
index 228dfa382eec..f235edb593ba 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -300,8 +300,11 @@ config BPF_JIT
 
 config BPF_STREAM_PARSER
 	bool "enable BPF STREAM_PARSER"
+	depends on INET
 	depends on BPF_SYSCALL
+	depends on CGROUP_BPF
 	select STREAM_PARSER
+	select NET_SOCK_MSG
 	---help---
 	 Enabling this allows a stream parser to be used with
 	 BPF_MAP_TYPE_SOCKMAP.
@@ -413,6 +416,14 @@ config GRO_CELLS
 config SOCK_VALIDATE_XMIT
 	bool
 
+config NET_SOCK_MSG
+	bool
+	default n
+	help
+	  The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+	  ULPs (upper layer modules, e.g. TLS) to process L7 application data
+	  with the help of BPF programs.
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/core/Makefile b/net/core/Makefile
index 80175e6a2eb8..fccd31e0e7f7 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,6 +16,7 @@ obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 obj-y += net-sysfs.o
 obj-$(CONFIG_PAGE_POOL) += page_pool.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index b844761b5d4c..0f5260b04bfe 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
 #include <net/protocol.h>
 #include <net/netlink.h>
 #include <linux/skbuff.h>
+#include <linux/skmsg.h>
 #include <net/sock.h>
 #include <net/flow_dissector.h>
 #include <linux/errno.h>
@@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
-	if (!tcb->bpf.sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
-	.func           = bpf_sk_redirect_hash,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_PTR_TO_MAP_KEY,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
-	   struct bpf_map *, map, u32, key, u64, flags)
-{
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
-	if (!tcb->bpf.sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-struct sock *do_sk_redirect_map(struct sk_buff *skb)
-{
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-	return tcb->bpf.sk_redir;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
-	.func           = bpf_sk_redirect_map,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_ANYTHING,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	msg->flags = flags;
-	msg->sk_redir = __sock_hash_lookup_elem(map, key);
-	if (!msg->sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
-	.func           = bpf_msg_redirect_hash,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_PTR_TO_MAP_KEY,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
-	   struct bpf_map *, map, u32, key, u64, flags)
-{
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	msg->flags = flags;
-	msg->sk_redir = __sock_map_lookup_elem(map, key);
-	if (!msg->sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
-{
-	return msg->sk_redir;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
-	.func           = bpf_msg_redirect_map,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_ANYTHING,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
 {
 	msg->apply_bytes = bytes;
 	return 0;
@@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
 {
 	msg->cork_bytes = bytes;
 	return 0;
@@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-#define sk_msg_iter_var(var)			\
-	do {					\
-		var++;				\
-		if (var == MAX_SKB_FRAGS)	\
-			var = 0;		\
-	} while (0)
-
-BPF_CALL_4(bpf_msg_pull_data,
-	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+	   u32, end, u64, flags)
 {
-	unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
-	int bytes = end - start, bytes_sg_total;
-	struct scatterlist *sg = msg->sg_data;
-	int first_sg, last_sg, i, shift;
-	unsigned char *p, *to, *from;
+	u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+	u32 first_sge, last_sge, i, shift, bytes_sg_total;
+	struct scatterlist *sge;
+	u8 *raw, *to, *from;
 	struct page *page;
 
 	if (unlikely(flags || end <= start))
 		return -EINVAL;
 
 	/* First find the starting scatterlist element */
-	i = msg->sg_start;
+	i = msg->sg.start;
 	do {
-		len = sg[i].length;
+		len = sk_msg_elem(msg, i)->length;
 		if (start < offset + len)
 			break;
 		offset += len;
-		sk_msg_iter_var(i);
-	} while (i != msg->sg_end);
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
 
 	if (unlikely(start >= offset + len))
 		return -EINVAL;
 
-	first_sg = i;
+	first_sge = i;
 	/* The start may point into the sg element so we need to also
 	 * account for the headroom.
 	 */
 	bytes_sg_total = start - offset + bytes;
-	if (!msg->sg_copy[i] && bytes_sg_total <= len)
+	if (!msg->sg.copy[i] && bytes_sg_total <= len)
 		goto out;
 
 	/* At this point we need to linearize multiple scatterlist
@@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
 	 * will copy the entire sg entry.
 	 */
 	do {
-		copy += sg[i].length;
-		sk_msg_iter_var(i);
+		copy += sk_msg_elem(msg, i)->length;
+		sk_msg_iter_var_next(i);
 		if (bytes_sg_total <= copy)
 			break;
-	} while (i != msg->sg_end);
-	last_sg = i;
+	} while (i != msg->sg.end);
+	last_sge = i;
 
 	if (unlikely(bytes_sg_total > copy))
 		return -EINVAL;
@@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
 			   get_order(copy));
 	if (unlikely(!page))
 		return -ENOMEM;
-	p = page_address(page);
 
-	i = first_sg;
+	raw = page_address(page);
+	i = first_sge;
 	do {
-		from = sg_virt(&sg[i]);
-		len = sg[i].length;
-		to = p + poffset;
+		sge = sk_msg_elem(msg, i);
+		from = sg_virt(sge);
+		len = sge->length;
+		to = raw + poffset;
 
 		memcpy(to, from, len);
 		poffset += len;
-		sg[i].length = 0;
-		put_page(sg_page(&sg[i]));
+		sge->length = 0;
+		put_page(sg_page(sge));
 
-		sk_msg_iter_var(i);
-	} while (i != last_sg);
+		sk_msg_iter_var_next(i);
+	} while (i != last_sge);
 
-	sg[first_sg].length = copy;
-	sg_set_page(&sg[first_sg], page, copy, 0);
+	sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
 
 	/* To repair sg ring we need to shift entries. If we only
 	 * had a single entry though we can just replace it and
 	 * be done. Otherwise walk the ring and shift the entries.
 	 */
-	WARN_ON_ONCE(last_sg == first_sg);
-	shift = last_sg > first_sg ?
-		last_sg - first_sg - 1 :
-		MAX_SKB_FRAGS - first_sg + last_sg - 1;
+	WARN_ON_ONCE(last_sge == first_sge);
+	shift = last_sge > first_sge ?
+		last_sge - first_sge - 1 :
+		MAX_SKB_FRAGS - first_sge + last_sge - 1;
 	if (!shift)
 		goto out;
 
-	i = first_sg;
-	sk_msg_iter_var(i);
+	i = first_sge;
+	sk_msg_iter_var_next(i);
 	do {
-		int move_from;
+		u32 move_from;
 
-		if (i + shift >= MAX_SKB_FRAGS)
-			move_from = i + shift - MAX_SKB_FRAGS;
+		if (i + shift >= MAX_MSG_FRAGS)
+			move_from = i + shift - MAX_MSG_FRAGS;
 		else
 			move_from = i + shift;
-
-		if (move_from == msg->sg_end)
+		if (move_from == msg->sg.end)
 			break;
 
-		sg[i] = sg[move_from];
-		sg[move_from].length = 0;
-		sg[move_from].page_link = 0;
-		sg[move_from].offset = 0;
-
-		sk_msg_iter_var(i);
+		msg->sg.data[i] = msg->sg.data[move_from];
+		msg->sg.data[move_from].length = 0;
+		msg->sg.data[move_from].page_link = 0;
+		msg->sg.data[move_from].offset = 0;
+		sk_msg_iter_var_next(i);
 	} while (1);
-	msg->sg_end -= shift;
-	if (msg->sg_end < 0)
-		msg->sg_end += MAX_SKB_FRAGS;
+
+	msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+		      msg->sg.end - shift + MAX_MSG_FRAGS :
+		      msg->sg.end - shift;
 out:
-	msg->data = sg_virt(&sg[first_sg]) + start - offset;
+	msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
 	msg->data_end = msg->data + bytes;
-
 	return 0;
 }
 
@@ -5203,6 +5078,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
 static const struct bpf_func_proto *
 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -5226,6 +5104,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
 static const struct bpf_func_proto *
 sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -5247,6 +5128,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
 static const struct bpf_func_proto *
 sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -7001,22 +6885,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, data));
+				      offsetof(struct sk_msg, data));
 		break;
 	case offsetof(struct sk_msg_md, data_end):
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, data_end));
+				      offsetof(struct sk_msg, data_end));
 		break;
 	case offsetof(struct sk_msg_md, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-					      struct sk_msg_buff, sk),
+					      struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_family));
 		break;
@@ -7025,9 +6909,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_daddr));
 		break;
@@ -7037,9 +6921,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 					  skc_rcv_saddr) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-					      struct sk_msg_buff, sk),
+					      struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common,
 					       skc_rcv_saddr));
@@ -7054,9 +6938,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		off = si->off;
 		off -= offsetof(struct sk_msg_md, remote_ip6[0]);
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common,
 					       skc_v6_daddr.s6_addr32[0]) +
@@ -7075,9 +6959,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		off = si->off;
 		off -= offsetof(struct sk_msg_md, local_ip6[0]);
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common,
 					       skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -7091,9 +6975,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_dport));
 #ifndef __BIG_ENDIAN_BITFIELD
@@ -7105,9 +6989,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 000000000000..ae2b281c9c57
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+	if (msg->sg.end > msg->sg.start &&
+	    elem_first_coalesce < msg->sg.end)
+		return true;
+
+	if (msg->sg.end < msg->sg.start &&
+	    (elem_first_coalesce > msg->sg.start ||
+	     elem_first_coalesce < msg->sg.end))
+		return true;
+
+	return false;
+}
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+		 int elem_first_coalesce)
+{
+	struct page_frag *pfrag = sk_page_frag(sk);
+	int ret = 0;
+
+	len -= msg->sg.size;
+	while (len > 0) {
+		struct scatterlist *sge;
+		u32 orig_offset;
+		int use, i;
+
+		if (!sk_page_frag_refill(sk, pfrag))
+			return -ENOMEM;
+
+		orig_offset = pfrag->offset;
+		use = min_t(int, len, pfrag->size - orig_offset);
+		if (!sk_wmem_schedule(sk, use))
+			return -ENOMEM;
+
+		i = msg->sg.end;
+		sk_msg_iter_var_prev(i);
+		sge = &msg->sg.data[i];
+
+		if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+		    sg_page(sge) == pfrag->page &&
+		    sge->offset + sge->length == orig_offset) {
+			sge->length += use;
+		} else {
+			if (sk_msg_full(msg)) {
+				ret = -ENOSPC;
+				break;
+			}
+
+			sge = &msg->sg.data[msg->sg.end];
+			sg_unmark_end(sge);
+			sg_set_page(sge, pfrag->page, use, orig_offset);
+			get_page(pfrag->page);
+			sk_msg_iter_next(msg, end);
+		}
+
+		sk_mem_charge(sk, use);
+		msg->sg.size += use;
+		pfrag->offset += use;
+		len -= use;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+	int i = msg->sg.start;
+
+	do {
+		struct scatterlist *sge = sk_msg_elem(msg, i);
+
+		if (bytes < sge->length) {
+			sge->length -= bytes;
+			sge->offset += bytes;
+			sk_mem_uncharge(sk, bytes);
+			break;
+		}
+
+		sk_mem_uncharge(sk, sge->length);
+		bytes -= sge->length;
+		sge->length = 0;
+		sge->offset = 0;
+		sk_msg_iter_var_next(i);
+	} while (bytes && i != msg->sg.end);
+	msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+	int i = msg->sg.start;
+
+	do {
+		struct scatterlist *sge = &msg->sg.data[i];
+		int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+		sk_mem_uncharge(sk, uncharge);
+		bytes -= uncharge;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+			    bool charge)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, i);
+	u32 len = sge->length;
+
+	if (charge)
+		sk_mem_uncharge(sk, len);
+	if (!msg->skb)
+		put_page(sg_page(sge));
+	memset(sge, 0, sizeof(*sge));
+	return len;
+}
+
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+			 bool charge)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, i);
+	int freed = 0;
+
+	while (msg->sg.size) {
+		msg->sg.size -= sge->length;
+		freed += sk_msg_free_elem(sk, msg, i, charge);
+		sk_msg_iter_var_next(i);
+		sk_msg_check_to_free(msg, i, msg->sg.size);
+		sge = sk_msg_elem(msg, i);
+	}
+	if (msg->skb)
+		consume_skb(msg->skb);
+	sk_msg_init(msg);
+	return freed;
+}
+
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+	return __sk_msg_free(sk, msg, msg->sg.start, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+	return __sk_msg_free(sk, msg, msg->sg.start, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+				  u32 bytes, bool charge)
+{
+	struct scatterlist *sge;
+	u32 i = msg->sg.start;
+
+	while (bytes) {
+		sge = sk_msg_elem(msg, i);
+		if (!sge->length)
+			break;
+		if (bytes < sge->length) {
+			if (charge)
+				sk_mem_uncharge(sk, bytes);
+			sge->length -= bytes;
+			sge->offset += bytes;
+			msg->sg.size -= bytes;
+			break;
+		}
+
+		msg->sg.size -= sge->length;
+		bytes -= sge->length;
+		sk_msg_free_elem(sk, msg, i, charge);
+		sk_msg_iter_var_next(i);
+		sk_msg_check_to_free(msg, i, bytes);
+	}
+	msg->sg.start = i;
+}
+
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+	__sk_msg_free_partial(sk, msg, bytes, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+				  u32 bytes)
+{
+	__sk_msg_free_partial(sk, msg, bytes, false);
+}
+
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+	int trim = msg->sg.size - len;
+	u32 i = msg->sg.end;
+
+	if (trim <= 0) {
+		WARN_ON(trim < 0);
+		return;
+	}
+
+	sk_msg_iter_var_prev(i);
+	msg->sg.size = len;
+	while (msg->sg.data[i].length &&
+	       trim >= msg->sg.data[i].length) {
+		trim -= msg->sg.data[i].length;
+		sk_msg_free_elem(sk, msg, i, true);
+		sk_msg_iter_var_prev(i);
+		if (!trim)
+			goto out;
+	}
+
+	msg->sg.data[i].length -= trim;
+	sk_mem_uncharge(sk, trim);
+out:
+	/* If we trim data before curr pointer update copybreak and current
+	 * so that any future copy operations start at new copy location.
+	 * However trimed data that has not yet been used in a copy op
+	 * does not require an update.
+	 */
+	if (msg->sg.curr >= i) {
+		msg->sg.curr = i;
+		msg->sg.copybreak = msg->sg.data[i].length;
+	}
+	sk_msg_iter_var_next(i);
+	msg->sg.end = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+			      struct sk_msg *msg, u32 bytes)
+{
+	int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+	const int to_max_pages = MAX_MSG_FRAGS;
+	struct page *pages[MAX_MSG_FRAGS];
+	ssize_t orig, copied, use, offset;
+
+	orig = msg->sg.size;
+	while (bytes > 0) {
+		i = 0;
+		maxpages = to_max_pages - num_elems;
+		if (maxpages == 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+					    &offset);
+		if (copied <= 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		iov_iter_advance(from, copied);
+		bytes -= copied;
+		msg->sg.size += copied;
+
+		while (copied) {
+			use = min_t(int, copied, PAGE_SIZE - offset);
+			sg_set_page(&msg->sg.data[msg->sg.end],
+				    pages[i], use, offset);
+			sg_unmark_end(&msg->sg.data[msg->sg.end]);
+			sk_mem_charge(sk, use);
+
+			offset = 0;
+			copied -= use;
+			sk_msg_iter_next(msg, end);
+			num_elems++;
+			i++;
+		}
+		/* When zerocopy is mixed with sk_msg_*copy* operations we
+		 * may have a copybreak set in this case clear and prefer
+		 * zerocopy remainder when possible.
+		 */
+		msg->sg.copybreak = 0;
+		msg->sg.curr = msg->sg.end;
+	}
+out:
+	/* Revert iov_iter updates, msg will need to use 'trim' later if it
+	 * also needs to be cleared.
+	 */
+	if (ret)
+		iov_iter_revert(from, msg->sg.size - orig);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+			     struct sk_msg *msg, u32 bytes)
+{
+	int ret = -ENOSPC, i = msg->sg.curr;
+	struct scatterlist *sge;
+	u32 copy, buf_size;
+	void *to;
+
+	do {
+		sge = sk_msg_elem(msg, i);
+		/* This is possible if a trim operation shrunk the buffer */
+		if (msg->sg.copybreak >= sge->length) {
+			msg->sg.copybreak = 0;
+			sk_msg_iter_var_next(i);
+			if (i == msg->sg.end)
+				break;
+			sge = sk_msg_elem(msg, i);
+		}
+
+		buf_size = sge->length - msg->sg.copybreak;
+		copy = (buf_size > bytes) ? bytes : buf_size;
+		to = sg_virt(sge) + msg->sg.copybreak;
+		msg->sg.copybreak += copy;
+		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+			ret = copy_from_iter_nocache(to, copy, from);
+		else
+			ret = copy_from_iter(to, copy, from);
+		if (ret != copy) {
+			ret = -EFAULT;
+			goto out;
+		}
+		bytes -= copy;
+		if (!bytes)
+			break;
+		msg->sg.copybreak = 0;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+out:
+	msg->sg.curr = i;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sk;
+	int copied = 0, num_sge;
+	struct sk_msg *msg;
+
+	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	if (unlikely(!msg))
+		return -EAGAIN;
+	if (!sk_rmem_schedule(sk, skb, skb->len)) {
+		kfree(msg);
+		return -EAGAIN;
+	}
+
+	sk_msg_init(msg);
+	num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+	if (unlikely(num_sge < 0)) {
+		kfree(msg);
+		return num_sge;
+	}
+
+	sk_mem_charge(sk, skb->len);
+	copied = skb->len;
+	msg->sg.start = 0;
+	msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+	msg->skb = skb;
+
+	sk_psock_queue_msg(psock, msg);
+	sk->sk_data_ready(sk);
+	return copied;
+}
+
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+			       u32 off, u32 len, bool ingress)
+{
+	if (ingress)
+		return sk_psock_skb_ingress(psock, skb);
+	else
+		return skb_send_sock_locked(psock->sk, skb, off, len);
+}
+
+static void sk_psock_backlog(struct work_struct *work)
+{
+	struct sk_psock *psock = container_of(work, struct sk_psock, work);
+	struct sk_psock_work_state *state = &psock->work_state;
+	struct sk_buff *skb;
+	bool ingress;
+	u32 len, off;
+	int ret;
+
+	/* Lock sock to avoid losing sk_socket during loop. */
+	lock_sock(psock->sk);
+	if (state->skb) {
+		skb = state->skb;
+		len = state->len;
+		off = state->off;
+		state->skb = NULL;
+		goto start;
+	}
+
+	while ((skb = skb_dequeue(&psock->ingress_skb))) {
+		len = skb->len;
+		off = 0;
+start:
+		ingress = tcp_skb_bpf_ingress(skb);
+		do {
+			ret = -EIO;
+			if (likely(psock->sk->sk_socket))
+				ret = sk_psock_handle_skb(psock, skb, off,
+							  len, ingress);
+			if (ret <= 0) {
+				if (ret == -EAGAIN) {
+					state->skb = skb;
+					state->len = len;
+					state->off = off;
+					goto end;
+				}
+				/* Hard errors break pipe and stop xmit. */
+				sk_psock_report_error(psock, ret ? -ret : EPIPE);
+				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+				kfree_skb(skb);
+				goto end;
+			}
+			off += ret;
+			len -= ret;
+		} while (len);
+
+		if (!ingress)
+			kfree_skb(skb);
+	}
+end:
+	release_sock(psock->sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+	struct sk_psock *psock = kzalloc_node(sizeof(*psock),
+					      GFP_ATOMIC | __GFP_NOWARN,
+					      node);
+	if (!psock)
+		return NULL;
+
+	psock->sk = sk;
+	psock->eval =  __SK_NONE;
+
+	INIT_LIST_HEAD(&psock->link);
+	spin_lock_init(&psock->link_lock);
+
+	INIT_WORK(&psock->work, sk_psock_backlog);
+	INIT_LIST_HEAD(&psock->ingress_msg);
+	skb_queue_head_init(&psock->ingress_skb);
+
+	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+	refcount_set(&psock->refcnt, 1);
+
+	rcu_assign_sk_user_data(sk, psock);
+	sock_hold(sk);
+
+	return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+	struct sk_psock_link *link;
+
+	spin_lock_bh(&psock->link_lock);
+	link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
+					list);
+	if (link)
+		list_del(&link->list);
+	spin_unlock_bh(&psock->link_lock);
+	return link;
+}
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg, *tmp;
+
+	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
+		list_del(&msg->list);
+		sk_msg_free(psock->sk, msg);
+		kfree(msg);
+	}
+}
+
+static void sk_psock_zap_ingress(struct sk_psock *psock)
+{
+	__skb_queue_purge(&psock->ingress_skb);
+	__sk_psock_purge_ingress_msg(psock);
+}
+
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+	struct sk_psock_link *link, *tmp;
+
+	list_for_each_entry_safe(link, tmp, &psock->link, list) {
+		list_del(&link->list);
+		sk_psock_free_link(link);
+	}
+}
+
+static void sk_psock_destroy_deferred(struct work_struct *gc)
+{
+	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+
+	/* No sk_callback_lock since already detached. */
+	if (psock->parser.enabled)
+		strp_done(&psock->parser.strp);
+
+	cancel_work_sync(&psock->work);
+
+	psock_progs_drop(&psock->progs);
+
+	sk_psock_link_destroy(psock);
+	sk_psock_cork_free(psock);
+	sk_psock_zap_ingress(psock);
+
+	if (psock->sk_redir)
+		sock_put(psock->sk_redir);
+	sock_put(psock->sk);
+	kfree(psock);
+}
+
+void sk_psock_destroy(struct rcu_head *rcu)
+{
+	struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
+
+	INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
+	schedule_work(&psock->gc);
+}
+EXPORT_SYMBOL_GPL(sk_psock_destroy);
+
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
+{
+	rcu_assign_sk_user_data(sk, NULL);
+	sk_psock_cork_free(psock);
+	sk_psock_restore_proto(sk, psock);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (psock->progs.skb_parser)
+		sk_psock_stop_strp(sk, psock);
+	write_unlock_bh(&sk->sk_callback_lock);
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+
+	call_rcu_sched(&psock->rcu, sk_psock_destroy);
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+static int sk_psock_map_verd(int verdict, bool redir)
+{
+	switch (verdict) {
+	case SK_PASS:
+		return redir ? __SK_REDIRECT : __SK_PASS;
+	case SK_DROP:
+	default:
+		break;
+	}
+
+	return __SK_DROP;
+}
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+			 struct sk_msg *msg)
+{
+	struct bpf_prog *prog;
+	int ret;
+
+	preempt_disable();
+	rcu_read_lock();
+	prog = READ_ONCE(psock->progs.msg_parser);
+	if (unlikely(!prog)) {
+		ret = __SK_PASS;
+		goto out;
+	}
+
+	sk_msg_compute_data_pointers(msg);
+	msg->sk = sk;
+	ret = BPF_PROG_RUN(prog, msg);
+	ret = sk_psock_map_verd(ret, msg->sk_redir);
+	psock->apply_bytes = msg->apply_bytes;
+	if (ret == __SK_REDIRECT) {
+		if (psock->sk_redir)
+			sock_put(psock->sk_redir);
+		psock->sk_redir = msg->sk_redir;
+		if (!psock->sk_redir) {
+			ret = __SK_DROP;
+			goto out;
+		}
+		sock_hold(psock->sk_redir);
+	}
+out:
+	rcu_read_unlock();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
+			    struct sk_buff *skb)
+{
+	int ret;
+
+	skb->sk = psock->sk;
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
+	ret = BPF_PROG_RUN(prog, skb);
+	preempt_enable();
+	/* strparser clones the skb before handing it to a upper layer,
+	 * meaning skb_orphan has been called. We NULL sk on the way out
+	 * to ensure we don't trigger a BUG_ON() in skb/sk operations
+	 * later and because we are not charging the memory of this skb
+	 * to any socket yet.
+	 */
+	skb->sk = NULL;
+	return ret;
+}
+
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+	struct sk_psock_parser *parser;
+
+	parser = container_of(strp, struct sk_psock_parser, strp);
+	return container_of(parser, struct sk_psock, parser);
+}
+
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+				   struct sk_buff *skb, int verdict)
+{
+	struct sk_psock *psock_other;
+	struct sock *sk_other;
+	bool ingress;
+
+	switch (verdict) {
+	case __SK_REDIRECT:
+		sk_other = tcp_skb_bpf_redirect_fetch(skb);
+		if (unlikely(!sk_other))
+			goto out_free;
+		psock_other = sk_psock(sk_other);
+		if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+		    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
+			goto out_free;
+		ingress = tcp_skb_bpf_ingress(skb);
+		if ((!ingress && sock_writeable(sk_other)) ||
+		    (ingress &&
+		     atomic_read(&sk_other->sk_rmem_alloc) <=
+		     sk_other->sk_rcvbuf)) {
+			if (!ingress)
+				skb_set_owner_w(skb, sk_other);
+			skb_queue_tail(&psock_other->ingress_skb, skb);
+			schedule_work(&psock_other->work);
+			break;
+		}
+		/* fall-through */
+	case __SK_DROP:
+		/* fall-through */
+	default:
+out_free:
+		kfree_skb(skb);
+	}
+}
+
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
+{
+	struct sk_psock *psock = sk_psock_from_strp(strp);
+	struct bpf_prog *prog;
+	int ret = __SK_DROP;
+
+	rcu_read_lock();
+	prog = READ_ONCE(psock->progs.skb_verdict);
+	if (likely(prog)) {
+		skb_orphan(skb);
+		tcp_skb_bpf_redirect_clear(skb);
+		ret = sk_psock_bpf_run(psock, prog, skb);
+		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+	}
+	rcu_read_unlock();
+	sk_psock_verdict_apply(psock, skb, ret);
+}
+
+static int sk_psock_strp_read_done(struct strparser *strp, int err)
+{
+	return err;
+}
+
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+	struct sk_psock *psock = sk_psock_from_strp(strp);
+	struct bpf_prog *prog;
+	int ret = skb->len;
+
+	rcu_read_lock();
+	prog = READ_ONCE(psock->progs.skb_parser);
+	if (likely(prog))
+		ret = sk_psock_bpf_run(psock, prog, skb);
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Called with socket lock held. */
+static void sk_psock_data_ready(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		write_lock_bh(&sk->sk_callback_lock);
+		strp_data_ready(&psock->parser.strp);
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+	rcu_read_unlock();
+}
+
+static void sk_psock_write_space(struct sock *sk)
+{
+	struct sk_psock *psock;
+	void (*write_space)(struct sock *sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
+		schedule_work(&psock->work);
+	write_space = psock->saved_write_space;
+	rcu_read_unlock();
+	write_space(sk);
+}
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+	static const struct strp_callbacks cb = {
+		.rcv_msg	= sk_psock_strp_read,
+		.read_sock_done	= sk_psock_strp_read_done,
+		.parse_msg	= sk_psock_strp_parse,
+	};
+
+	psock->parser.enabled = false;
+	return strp_init(&psock->parser.strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_parser *parser = &psock->parser;
+
+	if (parser->enabled)
+		return;
+
+	parser->saved_data_ready = sk->sk_data_ready;
+	sk->sk_data_ready = sk_psock_data_ready;
+	sk->sk_write_space = sk_psock_write_space;
+	parser->enabled = true;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_parser *parser = &psock->parser;
+
+	if (!parser->enabled)
+		return;
+
+	sk->sk_data_ready = parser->saved_data_ready;
+	parser->saved_data_ready = NULL;
+	strp_stop(&parser->strp);
+	parser->enabled = false;
+}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 000000000000..3c0e44cb811a
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1002 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sks;
+	struct sk_psock_progs progs;
+	raw_spinlock_t lock;
+};
+
+#define SOCK_CREATE_FLAG_MASK				\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	u64 cost;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->max_entries == 0 ||
+	    attr->key_size    != 4 ||
+	    attr->value_size  != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	stab = kzalloc(sizeof(*stab), GFP_USER);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&stab->map, attr);
+	raw_spin_lock_init(&stab->lock);
+
+	/* Make sure page count doesn't overflow. */
+	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+	if (cost >= U32_MAX - PAGE_SIZE) {
+		err = -EINVAL;
+		goto free_stab;
+	}
+
+	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(stab->map.pages);
+	if (err)
+		goto free_stab;
+
+	stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+				       sizeof(struct sock *),
+				       stab->map.numa_node);
+	if (stab->sks)
+		return &stab->map;
+	err = -ENOMEM;
+free_stab:
+	kfree(stab);
+	return ERR_PTR(err);
+}
+
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	u32 ufd = attr->target_fd;
+	struct bpf_map *map;
+	struct fd f;
+	int ret;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	ret = sock_map_prog_update(map, prog, attr->attach_type);
+	fdput(f);
+	return ret;
+}
+
+static void sock_map_sk_acquire(struct sock *sk)
+	__acquires(&sk->sk_lock.slock)
+{
+	lock_sock(sk);
+	preempt_disable();
+	rcu_read_lock();
+}
+
+static void sock_map_sk_release(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+{
+	rcu_read_unlock();
+	preempt_enable();
+	release_sock(sk);
+}
+
+static void sock_map_add_link(struct sk_psock *psock,
+			      struct sk_psock_link *link,
+			      struct bpf_map *map, void *link_raw)
+{
+	link->link_raw = link_raw;
+	link->map = map;
+	spin_lock_bh(&psock->link_lock);
+	list_add_tail(&link->list, &psock->link);
+	spin_unlock_bh(&psock->link_lock);
+}
+
+static void sock_map_del_link(struct sock *sk,
+			      struct sk_psock *psock, void *link_raw)
+{
+	struct sk_psock_link *link, *tmp;
+	bool strp_stop = false;
+
+	spin_lock_bh(&psock->link_lock);
+	list_for_each_entry_safe(link, tmp, &psock->link, list) {
+		if (link->link_raw == link_raw) {
+			struct bpf_map *map = link->map;
+			struct bpf_stab *stab = container_of(map, struct bpf_stab,
+							     map);
+			if (psock->parser.enabled && stab->progs.skb_parser)
+				strp_stop = true;
+			list_del(&link->list);
+			sk_psock_free_link(link);
+		}
+	}
+	spin_unlock_bh(&psock->link_lock);
+	if (strp_stop) {
+		write_lock_bh(&sk->sk_callback_lock);
+		sk_psock_stop_strp(sk, psock);
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+}
+
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+	struct sk_psock *psock = sk_psock(sk);
+
+	if (likely(psock)) {
+		sock_map_del_link(sk, psock, link_raw);
+		sk_psock_put(sk, psock);
+	}
+}
+
+static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
+			 struct sock *sk)
+{
+	struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+	bool skb_progs, sk_psock_is_new = false;
+	struct sk_psock *psock;
+	int ret;
+
+	skb_verdict = READ_ONCE(progs->skb_verdict);
+	skb_parser = READ_ONCE(progs->skb_parser);
+	skb_progs = skb_parser && skb_verdict;
+	if (skb_progs) {
+		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+		if (IS_ERR(skb_verdict))
+			return PTR_ERR(skb_verdict);
+		skb_parser = bpf_prog_inc_not_zero(skb_parser);
+		if (IS_ERR(skb_parser)) {
+			bpf_prog_put(skb_verdict);
+			return PTR_ERR(skb_parser);
+		}
+	}
+
+	msg_parser = READ_ONCE(progs->msg_parser);
+	if (msg_parser) {
+		msg_parser = bpf_prog_inc_not_zero(msg_parser);
+		if (IS_ERR(msg_parser)) {
+			ret = PTR_ERR(msg_parser);
+			goto out;
+		}
+	}
+
+	psock = sk_psock_get(sk);
+	if (psock) {
+		if (!sk_has_psock(sk)) {
+			ret = -EBUSY;
+			goto out_progs;
+		}
+		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+		    (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
+			sk_psock_put(sk, psock);
+			ret = -EBUSY;
+			goto out_progs;
+		}
+	} else {
+		psock = sk_psock_init(sk, map->numa_node);
+		if (!psock) {
+			ret = -ENOMEM;
+			goto out_progs;
+		}
+		sk_psock_is_new = true;
+	}
+
+	if (msg_parser)
+		psock_set_prog(&psock->progs.msg_parser, msg_parser);
+	if (sk_psock_is_new) {
+		ret = tcp_bpf_init(sk);
+		if (ret < 0)
+			goto out_drop;
+	} else {
+		tcp_bpf_reinit(sk);
+	}
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (skb_progs && !psock->parser.enabled) {
+		ret = sk_psock_init_strp(sk, psock);
+		if (ret) {
+			write_unlock_bh(&sk->sk_callback_lock);
+			goto out_drop;
+		}
+		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+		psock_set_prog(&psock->progs.skb_parser, skb_parser);
+		sk_psock_start_strp(sk, psock);
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	return 0;
+out_drop:
+	sk_psock_put(sk, psock);
+out_progs:
+	if (msg_parser)
+		bpf_prog_put(msg_parser);
+out:
+	if (skb_progs) {
+		bpf_prog_put(skb_verdict);
+		bpf_prog_put(skb_parser);
+	}
+	return ret;
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int i;
+
+	synchronize_rcu();
+	rcu_read_lock();
+	raw_spin_lock_bh(&stab->lock);
+	for (i = 0; i < stab->map.max_entries; i++) {
+		struct sock **psk = &stab->sks[i];
+		struct sock *sk;
+
+		sk = xchg(psk, NULL);
+		if (sk)
+			sock_map_unref(sk, psk);
+	}
+	raw_spin_unlock_bh(&stab->lock);
+	rcu_read_unlock();
+
+	bpf_map_area_free(stab->sks);
+	kfree(stab);
+}
+
+static void sock_map_release_progs(struct bpf_map *map)
+{
+	psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs);
+}
+
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (unlikely(key >= map->max_entries))
+		return NULL;
+	return READ_ONCE(stab->sks[key]);
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+			     struct sock **psk)
+{
+	struct sock *sk;
+
+	raw_spin_lock_bh(&stab->lock);
+	sk = *psk;
+	if (!sk_test || sk_test == sk)
+		*psk = NULL;
+	raw_spin_unlock_bh(&stab->lock);
+	if (unlikely(!sk))
+		return -EINVAL;
+	sock_map_unref(sk, psk);
+	return 0;
+}
+
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+				      void *link_raw)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	__sock_map_delete(stab, sk, link_raw);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 i = *(u32 *)key;
+	struct sock **psk;
+
+	if (unlikely(i >= map->max_entries))
+		return -EINVAL;
+
+	psk = &stab->sks[i];
+	return __sock_map_delete(stab, NULL, psk);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 i = key ? *(u32 *)key : U32_MAX;
+	u32 *key_next = next;
+
+	if (i == stab->map.max_entries - 1)
+		return -ENOENT;
+	if (i >= stab->map.max_entries)
+		*key_next = 0;
+	else
+		*key_next = i + 1;
+	return 0;
+}
+
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+				  struct sock *sk, u64 flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct sk_psock_link *link;
+	struct sk_psock *psock;
+	struct sock *osk;
+	int ret;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(idx >= map->max_entries))
+		return -E2BIG;
+
+	link = sk_psock_init_link();
+	if (!link)
+		return -ENOMEM;
+
+	ret = sock_map_link(map, &stab->progs, sk);
+	if (ret < 0)
+		goto out_free;
+
+	psock = sk_psock(sk);
+	WARN_ON_ONCE(!psock);
+
+	raw_spin_lock_bh(&stab->lock);
+	osk = stab->sks[idx];
+	if (osk && flags == BPF_NOEXIST) {
+		ret = -EEXIST;
+		goto out_unlock;
+	} else if (!osk && flags == BPF_EXIST) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	sock_map_add_link(psock, link, map, &stab->sks[idx]);
+	stab->sks[idx] = sk;
+	if (osk)
+		sock_map_unref(osk, &stab->sks[idx]);
+	raw_spin_unlock_bh(&stab->lock);
+	return 0;
+out_unlock:
+	raw_spin_unlock_bh(&stab->lock);
+	if (psock)
+		sk_psock_put(sk, psock);
+out_free:
+	sk_psock_free_link(link);
+	return ret;
+}
+
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+	return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+	       ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
+
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+	return sk->sk_type == SOCK_STREAM &&
+	       sk->sk_protocol == IPPROTO_TCP;
+}
+
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+				void *value, u64 flags)
+{
+	u32 ufd = *(u32 *)value;
+	u32 idx = *(u32 *)key;
+	struct socket *sock;
+	struct sock *sk;
+	int ret;
+
+	sock = sockfd_lookup(ufd, &ret);
+	if (!sock)
+		return ret;
+	sk = sock->sk;
+	if (!sk) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!sock_map_sk_is_suitable(sk) ||
+	    sk->sk_state != TCP_ESTABLISHED) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	sock_map_sk_acquire(sk);
+	ret = sock_map_update_common(map, idx, sk, flags);
+	sock_map_sk_release(sk);
+out:
+	fput(sock->file);
+	return ret;
+}
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (likely(sock_map_sk_is_suitable(sops->sk) &&
+		   sock_map_op_okay(sops)))
+		return sock_map_update_common(map, *(u32 *)key, sops->sk,
+					      flags);
+	return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+	.func		= bpf_sock_map_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
+	return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+	.func           = bpf_sk_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+	msg->flags = flags;
+	msg->sk_redir = __sock_map_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
+	return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+	.func           = bpf_msg_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+const struct bpf_map_ops sock_map_ops = {
+	.map_alloc		= sock_map_alloc,
+	.map_free		= sock_map_free,
+	.map_get_next_key	= sock_map_get_next_key,
+	.map_update_elem	= sock_map_update_elem,
+	.map_delete_elem	= sock_map_delete_elem,
+	.map_lookup_elem	= sock_map_lookup,
+	.map_release_uref	= sock_map_release_progs,
+	.map_check_btf		= map_check_no_btf,
+};
+
+struct bpf_htab_elem {
+	struct rcu_head rcu;
+	u32 hash;
+	struct sock *sk;
+	struct hlist_node node;
+	u8 key[0];
+};
+
+struct bpf_htab_bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct bpf_htab_bucket *buckets;
+	u32 buckets_num;
+	u32 elem_size;
+	struct sk_psock_progs progs;
+	atomic_t count;
+};
+
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+	return jhash(key, len, 0);
+}
+
+static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
+						       u32 hash)
+{
+	return &htab->buckets[hash & (htab->buckets_num - 1)];
+}
+
+static struct bpf_htab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+			  u32 key_size)
+{
+	struct bpf_htab_elem *elem;
+
+	hlist_for_each_entry_rcu(elem, head, node) {
+		if (elem->hash == hash &&
+		    !memcmp(&elem->key, key, key_size))
+			return elem;
+	}
+
+	return NULL;
+}
+
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 key_size = map->key_size, hash;
+	struct bpf_htab_bucket *bucket;
+	struct bpf_htab_elem *elem;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	hash = sock_hash_bucket_hash(key, key_size);
+	bucket = sock_hash_select_bucket(htab, hash);
+	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+
+	return elem ? elem->sk : NULL;
+}
+
+static void sock_hash_free_elem(struct bpf_htab *htab,
+				struct bpf_htab_elem *elem)
+{
+	atomic_dec(&htab->count);
+	kfree_rcu(elem, rcu);
+}
+
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+				       void *link_raw)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_elem *elem_probe, *elem = link_raw;
+	struct bpf_htab_bucket *bucket;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	bucket = sock_hash_select_bucket(htab, elem->hash);
+
+	/* elem may be deleted in parallel from the map, but access here
+	 * is okay since it's going away only after RCU grace period.
+	 * However, we need to check whether it's still present.
+	 */
+	raw_spin_lock_bh(&bucket->lock);
+	elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+					       elem->key, map->key_size);
+	if (elem_probe && elem_probe == elem) {
+		hlist_del_rcu(&elem->node);
+		sock_map_unref(elem->sk, elem);
+		sock_hash_free_elem(htab, elem);
+	}
+	raw_spin_unlock_bh(&bucket->lock);
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 hash, key_size = map->key_size;
+	struct bpf_htab_bucket *bucket;
+	struct bpf_htab_elem *elem;
+	int ret = -ENOENT;
+
+	hash = sock_hash_bucket_hash(key, key_size);
+	bucket = sock_hash_select_bucket(htab, hash);
+
+	raw_spin_lock_bh(&bucket->lock);
+	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+	if (elem) {
+		hlist_del_rcu(&elem->node);
+		sock_map_unref(elem->sk, elem);
+		sock_hash_free_elem(htab, elem);
+		ret = 0;
+	}
+	raw_spin_unlock_bh(&bucket->lock);
+	return ret;
+}
+
+static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
+						  void *key, u32 key_size,
+						  u32 hash, struct sock *sk,
+						  struct bpf_htab_elem *old)
+{
+	struct bpf_htab_elem *new;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+		if (!old) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+	}
+
+	new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+			   htab->map.numa_node);
+	if (!new) {
+		atomic_dec(&htab->count);
+		return ERR_PTR(-ENOMEM);
+	}
+	memcpy(new->key, key, key_size);
+	new->sk = sk;
+	new->hash = hash;
+	return new;
+}
+
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+				   struct sock *sk, u64 flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 key_size = map->key_size, hash;
+	struct bpf_htab_elem *elem, *elem_new;
+	struct bpf_htab_bucket *bucket;
+	struct sk_psock_link *link;
+	struct sk_psock *psock;
+	int ret;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	link = sk_psock_init_link();
+	if (!link)
+		return -ENOMEM;
+
+	ret = sock_map_link(map, &htab->progs, sk);
+	if (ret < 0)
+		goto out_free;
+
+	psock = sk_psock(sk);
+	WARN_ON_ONCE(!psock);
+
+	hash = sock_hash_bucket_hash(key, key_size);
+	bucket = sock_hash_select_bucket(htab, hash);
+
+	raw_spin_lock_bh(&bucket->lock);
+	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+	if (elem && flags == BPF_NOEXIST) {
+		ret = -EEXIST;
+		goto out_unlock;
+	} else if (!elem && flags == BPF_EXIST) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+	if (IS_ERR(elem_new)) {
+		ret = PTR_ERR(elem_new);
+		goto out_unlock;
+	}
+
+	sock_map_add_link(psock, link, map, elem_new);
+	/* Add new element to the head of the list, so that
+	 * concurrent search will find it before old elem.
+	 */
+	hlist_add_head_rcu(&elem_new->node, &bucket->head);
+	if (elem) {
+		hlist_del_rcu(&elem->node);
+		sock_map_unref(elem->sk, elem);
+		sock_hash_free_elem(htab, elem);
+	}
+	raw_spin_unlock_bh(&bucket->lock);
+	return 0;
+out_unlock:
+	raw_spin_unlock_bh(&bucket->lock);
+	sk_psock_put(sk, psock);
+out_free:
+	sk_psock_free_link(link);
+	return ret;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map, void *key,
+				 void *value, u64 flags)
+{
+	u32 ufd = *(u32 *)value;
+	struct socket *sock;
+	struct sock *sk;
+	int ret;
+
+	sock = sockfd_lookup(ufd, &ret);
+	if (!sock)
+		return ret;
+	sk = sock->sk;
+	if (!sk) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!sock_map_sk_is_suitable(sk) ||
+	    sk->sk_state != TCP_ESTABLISHED) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	sock_map_sk_acquire(sk);
+	ret = sock_hash_update_common(map, key, sk, flags);
+	sock_map_sk_release(sk);
+out:
+	fput(sock->file);
+	return ret;
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+				  void *key_next)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_elem *elem, *elem_next;
+	u32 hash, key_size = map->key_size;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first_elem;
+	hash = sock_hash_bucket_hash(key, key_size);
+	head = &sock_hash_select_bucket(htab, hash)->head;
+	elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+	if (!elem)
+		goto find_first_elem;
+
+	elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+				     struct bpf_htab_elem, node);
+	if (elem_next) {
+		memcpy(key_next, elem_next->key, key_size);
+		return 0;
+	}
+
+	i = hash & (htab->buckets_num - 1);
+	i++;
+find_first_elem:
+	for (; i < htab->buckets_num; i++) {
+		head = &sock_hash_select_bucket(htab, i)->head;
+		elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					     struct bpf_htab_elem, node);
+		if (elem_next) {
+			memcpy(key_next, elem_next->key, key_size);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int i, err;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->max_entries == 0 ||
+	    attr->key_size    == 0 ||
+	    attr->value_size  != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+	if (attr->key_size > MAX_BPF_STACK)
+		return ERR_PTR(-E2BIG);
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&htab->map, attr);
+
+	htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+	htab->elem_size = sizeof(struct bpf_htab_elem) +
+			  round_up(htab->map.key_size, 8);
+	if (htab->buckets_num == 0 ||
+	    htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
+		err = -EINVAL;
+		goto free_htab;
+	}
+
+	cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+	if (cost >= U32_MAX - PAGE_SIZE) {
+		err = -EINVAL;
+		goto free_htab;
+	}
+
+	htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+					   sizeof(struct bpf_htab_bucket),
+					   htab->map.numa_node);
+	if (!htab->buckets) {
+		err = -ENOMEM;
+		goto free_htab;
+	}
+
+	for (i = 0; i < htab->buckets_num; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
+
+	return &htab->map;
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_bucket *bucket;
+	struct bpf_htab_elem *elem;
+	struct hlist_node *node;
+	int i;
+
+	synchronize_rcu();
+	rcu_read_lock();
+	for (i = 0; i < htab->buckets_num; i++) {
+		bucket = sock_hash_select_bucket(htab, i);
+		raw_spin_lock_bh(&bucket->lock);
+		hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
+			hlist_del_rcu(&elem->node);
+			sock_map_unref(elem->sk, elem);
+		}
+		raw_spin_unlock_bh(&bucket->lock);
+	}
+	rcu_read_unlock();
+
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
+}
+
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+	psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
+}
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (likely(sock_map_sk_is_suitable(sops->sk) &&
+		   sock_map_op_okay(sops)))
+		return sock_hash_update_common(map, key, sops->sk, flags);
+	return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+	.func		= bpf_sock_hash_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
+	return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+	.func           = bpf_sk_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+	msg->flags = flags;
+	msg->sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
+	return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+	.func           = bpf_msg_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+const struct bpf_map_ops sock_hash_ops = {
+	.map_alloc		= sock_hash_alloc,
+	.map_free		= sock_hash_free,
+	.map_get_next_key	= sock_hash_get_next_key,
+	.map_update_elem	= sock_hash_update_elem,
+	.map_delete_elem	= sock_hash_delete_elem,
+	.map_lookup_elem	= sock_map_lookup,
+	.map_release_uref	= sock_hash_release_progs,
+	.map_check_btf		= map_check_no_btf,
+};
+
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_SOCKMAP:
+		return &container_of(map, struct bpf_stab, map)->progs;
+	case BPF_MAP_TYPE_SOCKHASH:
+		return &container_of(map, struct bpf_htab, map)->progs;
+	default:
+		break;
+	}
+
+	return NULL;
+}
+
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+			 u32 which)
+{
+	struct sk_psock_progs *progs = sock_map_progs(map);
+
+	if (!progs)
+		return -EOPNOTSUPP;
+
+	switch (which) {
+	case BPF_SK_MSG_VERDICT:
+		psock_set_prog(&progs->msg_parser, prog);
+		break;
+	case BPF_SK_SKB_STREAM_PARSER:
+		psock_set_prog(&progs->skb_parser, prog);
+		break;
+	case BPF_SK_SKB_STREAM_VERDICT:
+		psock_set_prog(&progs->skb_verdict, prog);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+	switch (link->map->map_type) {
+	case BPF_MAP_TYPE_SOCKMAP:
+		return sock_map_delete_from_link(link->map, sk,
+						 link->link_raw);
+	case BPF_MAP_TYPE_SOCKHASH:
+		return sock_hash_delete_from_link(link->map, sk,
+						  link->link_raw);
+	default:
+		break;
+	}
+}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98661d8..58629314eae9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..80debb0daf37
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+#include <net/inet_common.h>
+
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+	struct sk_psock *psock;
+	bool empty = true;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock))
+		empty = list_empty(&psock->ingress_msg);
+	rcu_read_unlock();
+	return !empty;
+}
+
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+			     int flags, long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = sk_wait_event(sk, &timeo,
+			    !list_empty(&psock->ingress_msg) ||
+			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+		      struct msghdr *msg, int len)
+{
+	struct iov_iter *iter = &msg->msg_iter;
+	int i, ret, copied = 0;
+
+	while (copied != len) {
+		struct scatterlist *sge;
+		struct sk_msg *msg_rx;
+
+		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+						  struct sk_msg, list);
+		if (unlikely(!msg_rx))
+			break;
+
+		i = msg_rx->sg.start;
+		do {
+			struct page *page;
+			int copy;
+
+			sge = sk_msg_elem(msg_rx, i);
+			copy = sge->length;
+			page = sg_page(sge);
+			if (copied + copy > len)
+				copy = len - copied;
+			ret = copy_page_to_iter(page, sge->offset, copy, iter);
+			if (ret != copy) {
+				msg_rx->sg.start = i;
+				return -EFAULT;
+			}
+
+			copied += copy;
+			sge->offset += copy;
+			sge->length -= copy;
+			sk_mem_uncharge(sk, copy);
+			if (!sge->length) {
+				i++;
+				if (i == MAX_SKB_FRAGS)
+					i = 0;
+				if (!msg_rx->skb)
+					put_page(page);
+			}
+
+			if (copied == len)
+				break;
+		} while (i != msg_rx->sg.end);
+
+		msg_rx->sg.start = i;
+		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+			list_del(&msg_rx->list);
+			if (msg_rx->skb)
+				consume_skb(msg_rx->skb);
+			kfree(msg_rx);
+		}
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
+
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		    int nonblock, int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied, ret;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+	lock_sock(sk);
+msg_bytes_ready:
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+	if (!copied) {
+		int data, err = 0;
+		long timeo;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+		if (data) {
+			if (skb_queue_empty(&sk->sk_receive_queue))
+				goto msg_bytes_ready;
+			release_sock(sk);
+			sk_psock_put(sk, psock);
+			return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+		}
+		if (err) {
+			ret = err;
+			goto out;
+		}
+	}
+	ret = copied;
+out:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+			   struct sk_msg *msg, u32 apply_bytes, int flags)
+{
+	bool apply = apply_bytes;
+	struct scatterlist *sge;
+	u32 size, copied = 0;
+	struct sk_msg *tmp;
+	int i, ret = 0;
+
+	tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+	if (unlikely(!tmp))
+		return -ENOMEM;
+
+	lock_sock(sk);
+	tmp->sg.start = msg->sg.start;
+	i = msg->sg.start;
+	do {
+		sge = sk_msg_elem(msg, i);
+		size = (apply && apply_bytes < sge->length) ?
+			apply_bytes : sge->length;
+		if (!sk_wmem_schedule(sk, size)) {
+			if (!copied)
+				ret = -ENOMEM;
+			break;
+		}
+
+		sk_mem_charge(sk, size);
+		sk_msg_xfer(tmp, msg, i, size);
+		copied += size;
+		if (sge->length)
+			get_page(sk_msg_page(tmp, i));
+		sk_msg_iter_var_next(i);
+		tmp->sg.end = i;
+		if (apply) {
+			apply_bytes -= size;
+			if (!apply_bytes)
+				break;
+		}
+	} while (i != msg->sg.end);
+
+	if (!ret) {
+		msg->sg.start = i;
+		msg->sg.size -= apply_bytes;
+		sk_psock_queue_msg(psock, tmp);
+		sk->sk_data_ready(sk);
+	} else {
+		sk_msg_free(sk, tmp);
+		kfree(tmp);
+	}
+
+	release_sock(sk);
+	return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+			int flags, bool uncharge)
+{
+	bool apply = apply_bytes;
+	struct scatterlist *sge;
+	struct page *page;
+	int size, ret = 0;
+	u32 off;
+
+	while (1) {
+		sge = sk_msg_elem(msg, msg->sg.start);
+		size = (apply && apply_bytes < sge->length) ?
+			apply_bytes : sge->length;
+		off  = sge->offset;
+		page = sg_page(sge);
+
+		tcp_rate_check_app_limited(sk);
+retry:
+		ret = do_tcp_sendpages(sk, page, off, size, flags);
+		if (ret <= 0)
+			return ret;
+		if (apply)
+			apply_bytes -= ret;
+		msg->sg.size -= ret;
+		sge->offset += ret;
+		sge->length -= ret;
+		if (uncharge)
+			sk_mem_uncharge(sk, ret);
+		if (ret != size) {
+			size -= ret;
+			off  += ret;
+			goto retry;
+		}
+		if (!sge->length) {
+			put_page(page);
+			sk_msg_iter_next(msg, start);
+			sg_init_table(sge, 1);
+			if (msg->sg.start == msg->sg.end)
+				break;
+		}
+		if (apply && !apply_bytes)
+			break;
+	}
+
+	return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+			       u32 apply_bytes, int flags, bool uncharge)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+	release_sock(sk);
+	return ret;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
+			  u32 bytes, int flags)
+{
+	bool ingress = sk_msg_to_ingress(msg);
+	struct sk_psock *psock = sk_psock_get(sk);
+	int ret;
+
+	if (unlikely(!psock)) {
+		sk_msg_free(sk, msg);
+		return 0;
+	}
+	ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
+			tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+				struct sk_msg *msg, int *copied, int flags)
+{
+	bool cork = false, enospc = msg->sg.start == msg->sg.end;
+	struct sock *sk_redir;
+	u32 tosend;
+	int ret;
+
+more_data:
+	if (psock->eval == __SK_NONE)
+		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+
+	if (msg->cork_bytes &&
+	    msg->cork_bytes > msg->sg.size && !enospc) {
+		psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+		if (!psock->cork) {
+			psock->cork = kzalloc(sizeof(*psock->cork),
+					      GFP_ATOMIC | __GFP_NOWARN);
+			if (!psock->cork)
+				return -ENOMEM;
+		}
+		memcpy(psock->cork, msg, sizeof(*msg));
+		return 0;
+	}
+
+	tosend = msg->sg.size;
+	if (psock->apply_bytes && psock->apply_bytes < tosend)
+		tosend = psock->apply_bytes;
+
+	switch (psock->eval) {
+	case __SK_PASS:
+		ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+		if (unlikely(ret)) {
+			*copied -= sk_msg_free(sk, msg);
+			break;
+		}
+		sk_msg_apply_bytes(psock, tosend);
+		break;
+	case __SK_REDIRECT:
+		sk_redir = psock->sk_redir;
+		sk_msg_apply_bytes(psock, tosend);
+		if (psock->cork) {
+			cork = true;
+			psock->cork = NULL;
+		}
+		sk_msg_return(sk, msg, tosend);
+		release_sock(sk);
+		ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+		lock_sock(sk);
+		if (unlikely(ret < 0)) {
+			int free = sk_msg_free_nocharge(sk, msg);
+
+			if (!cork)
+				*copied -= free;
+		}
+		if (cork) {
+			sk_msg_free(sk, msg);
+			kfree(msg);
+			msg = NULL;
+			ret = 0;
+		}
+		break;
+	case __SK_DROP:
+	default:
+		sk_msg_free_partial(sk, msg, tosend);
+		sk_msg_apply_bytes(psock, tosend);
+		*copied -= tosend;
+		return -EACCES;
+	}
+
+	if (likely(!ret)) {
+		if (!psock->apply_bytes) {
+			psock->eval =  __SK_NONE;
+			if (psock->sk_redir) {
+				sock_put(psock->sk_redir);
+				psock->sk_redir = NULL;
+			}
+		}
+		if (msg &&
+		    msg->sg.data[msg->sg.start].page_link &&
+		    msg->sg.data[msg->sg.start].length)
+			goto more_data;
+	}
+	return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct sk_msg tmp, *msg_tx = NULL;
+	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+	int copied = 0, err = 0;
+	struct sk_psock *psock;
+	long timeo;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_sendmsg(sk, msg, size);
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	while (msg_data_left(msg)) {
+		bool enospc = false;
+		u32 copy, osize;
+
+		if (sk->sk_err) {
+			err = -sk->sk_err;
+			goto out_err;
+		}
+
+		copy = msg_data_left(msg);
+		if (!sk_stream_memory_free(sk))
+			goto wait_for_sndbuf;
+		if (psock->cork) {
+			msg_tx = psock->cork;
+		} else {
+			msg_tx = &tmp;
+			sk_msg_init(msg_tx);
+		}
+
+		osize = msg_tx->sg.size;
+		err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+		if (err) {
+			if (err != -ENOSPC)
+				goto wait_for_memory;
+			enospc = true;
+			copy = msg_tx->sg.size - osize;
+		}
+
+		err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+					       copy);
+		if (err < 0) {
+			sk_msg_trim(sk, msg_tx, osize);
+			goto out_err;
+		}
+
+		copied += copy;
+		if (psock->cork_bytes) {
+			if (size > psock->cork_bytes)
+				psock->cork_bytes = 0;
+			else
+				psock->cork_bytes -= size;
+			if (psock->cork_bytes && !enospc)
+				goto out_err;
+			/* All cork bytes are accounted, rerun the prog. */
+			psock->eval = __SK_NONE;
+			psock->cork_bytes = 0;
+		}
+
+		err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+		if (unlikely(err < 0))
+			goto out_err;
+		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err) {
+			if (msg_tx && msg_tx != psock->cork)
+				sk_msg_free(sk, msg_tx);
+			goto out_err;
+		}
+	}
+out_err:
+	if (err < 0)
+		err = sk_stream_error(sk, msg->msg_flags, err);
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied ? copied : err;
+}
+
+static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
+			    size_t size, int flags)
+{
+	struct sk_msg tmp, *msg = NULL;
+	int err = 0, copied = 0;
+	struct sk_psock *psock;
+	bool enospc = false;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_sendpage(sk, page, offset, size, flags);
+
+	lock_sock(sk);
+	if (psock->cork) {
+		msg = psock->cork;
+	} else {
+		msg = &tmp;
+		sk_msg_init(msg);
+	}
+
+	/* Catch case where ring is full and sendpage is stalled. */
+	if (unlikely(sk_msg_full(msg)))
+		goto out_err;
+
+	sk_msg_page_add(msg, page, size, offset);
+	sk_mem_charge(sk, size);
+	copied = size;
+	if (sk_msg_full(msg))
+		enospc = true;
+	if (psock->cork_bytes) {
+		if (size > psock->cork_bytes)
+			psock->cork_bytes = 0;
+		else
+			psock->cork_bytes -= size;
+		if (psock->cork_bytes && !enospc)
+			goto out_err;
+		/* All cork bytes are accounted, rerun the prog. */
+		psock->eval = __SK_NONE;
+		psock->cork_bytes = 0;
+	}
+
+	err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
+out_err:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied ? copied : err;
+}
+
+static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_link *link;
+
+	sk_psock_cork_free(psock);
+	__sk_psock_purge_ingress_msg(psock);
+	while ((link = sk_psock_link_pop(psock))) {
+		sk_psock_unlink(sk, link);
+		sk_psock_free_link(link);
+	}
+}
+
+static void tcp_bpf_unhash(struct sock *sk)
+{
+	void (*saved_unhash)(struct sock *sk);
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		if (sk->sk_prot->unhash)
+			sk->sk_prot->unhash(sk);
+		return;
+	}
+
+	saved_unhash = psock->saved_unhash;
+	tcp_bpf_remove(sk, psock);
+	rcu_read_unlock();
+	saved_unhash(sk);
+}
+
+static void tcp_bpf_close(struct sock *sk, long timeout)
+{
+	void (*saved_close)(struct sock *sk, long timeout);
+	struct sk_psock *psock;
+
+	lock_sock(sk);
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		release_sock(sk);
+		return sk->sk_prot->close(sk, timeout);
+	}
+
+	saved_close = psock->saved_close;
+	tcp_bpf_remove(sk, psock);
+	rcu_read_unlock();
+	release_sock(sk);
+	saved_close(sk, timeout);
+}
+
+enum {
+	TCP_BPF_IPV4,
+	TCP_BPF_IPV6,
+	TCP_BPF_NUM_PROTS,
+};
+
+enum {
+	TCP_BPF_BASE,
+	TCP_BPF_TX,
+	TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+				   struct proto *base)
+{
+	prot[TCP_BPF_BASE]			= *base;
+	prot[TCP_BPF_BASE].unhash		= tcp_bpf_unhash;
+	prot[TCP_BPF_BASE].close		= tcp_bpf_close;
+	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
+	prot[TCP_BPF_BASE].stream_memory_read	= tcp_bpf_stream_read;
+
+	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
+	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
+	prot[TCP_BPF_TX].sendpage		= tcp_bpf_sendpage;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+	if (sk->sk_family == AF_INET6 &&
+	    unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+		spin_lock_bh(&tcpv6_prot_lock);
+		if (likely(ops != tcpv6_prot_saved)) {
+			tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+			smp_store_release(&tcpv6_prot_saved, ops);
+		}
+		spin_unlock_bh(&tcpv6_prot_lock);
+	}
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+	tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+	return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
+
+	sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
+}
+
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
+
+	/* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
+	 * or added requiring sk_prot hook updates. We keep original saved
+	 * hooks in this case.
+	 */
+	sk->sk_prot = &tcp_bpf_prots[family][config];
+}
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+	/* In order to avoid retpoline, we make assumptions when we call
+	 * into ops if e.g. a psock is not present. Make sure they are
+	 * indeed valid assumptions.
+	 */
+	return ops->recvmsg  == tcp_recvmsg &&
+	       ops->sendmsg  == tcp_sendmsg &&
+	       ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
+}
+
+void tcp_bpf_reinit(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	sock_owned_by_me(sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	tcp_bpf_reinit_sk_prot(sk, psock);
+	rcu_read_unlock();
+}
+
+int tcp_bpf_init(struct sock *sk)
+{
+	struct proto *ops = READ_ONCE(sk->sk_prot);
+	struct sk_psock *psock;
+
+	sock_owned_by_me(sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (unlikely(!psock || psock->sk_proto ||
+		     tcp_bpf_assert_proto_ops(ops))) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	tcp_bpf_check_v6_needs_rebuild(sk, ops);
+	tcp_bpf_update_sk_prot(sk, psock);
+	rcu_read_unlock();
+	return 0;
+}
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
index 6cff3f6d0c3a..94da19a2a220 100644
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
-
 config STREAM_PARSER
-	tristate
-	default n
+	def_bool n
-- 
cgit v1.2.3


From d829e9c4112b52f4f00195900fd4c685f61365ab Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 13 Oct 2018 02:45:59 +0200
Subject: tls: convert to generic sk_msg interface

Convert kTLS over to make use of sk_msg interface for plaintext and
encrypted scattergather data, so it reuses all the sk_msg helpers
and data structure which later on in a second step enables to glue
this to BPF.

This also allows to remove quite a bit of open coded helpers which
are covered by the sk_msg API. Recent changes in kTLs 80ece6a03aaf
("tls: Remove redundant vars from tls record structure") and
4e6d47206c32 ("tls: Add support for inplace records encryption")
changed the data path handling a bit; while we've kept the latter
optimization intact, we had to undo the former change to better
fit the sk_msg model, hence the sg_aead_in and sg_aead_out have
been brought back and are linked into the sk_msg sgs. Now the kTLS
record contains a msg_plaintext and msg_encrypted sk_msg each.

In the original code, the zerocopy_from_iter() has been used out
of TX but also RX path. For the strparser skb-based RX path,
we've left the zerocopy_from_iter() in decrypt_internal() mostly
untouched, meaning it has been moved into tls_setup_from_iter()
with charging logic removed (as not used from RX). Given RX path
is not based on sk_msg objects, we haven't pursued setting up a
dummy sk_msg to call into sk_msg_zerocopy_from_iter(), but it
could be an option to prusue in a later step.

Joint work with John.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skmsg.h |   2 +
 include/net/sock.h    |   4 -
 include/net/tls.h     |  18 +-
 net/core/skmsg.c      |  39 ++++
 net/core/sock.c       |  61 ------
 net/tls/Kconfig       |   1 +
 net/tls/tls_device.c  |   2 +-
 net/tls/tls_sw.c      | 511 ++++++++++++++++++--------------------------------
 8 files changed, 236 insertions(+), 402 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 95678103c4a0..4e84b3c2eff8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -102,6 +102,8 @@ struct sk_psock {
 
 int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
 		 int elem_first_coalesce);
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+		 u32 off, u32 len);
 void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
 int sk_msg_free(struct sock *sk, struct sk_msg *msg);
 int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
diff --git a/include/net/sock.h b/include/net/sock.h
index 751549ac0a84..7470c45d182d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2214,10 +2214,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
 
 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
 
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
-		int sg_start, int *sg_curr, unsigned int *sg_size,
-		int first_coalesce);
-
 /*
  *	Default write policy as shown to user space via poll/select/SIGIO
  */
diff --git a/include/net/tls.h b/include/net/tls.h
index 5e853835597e..3d22d8a59be7 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -39,6 +39,8 @@
 #include <linux/crypto.h>
 #include <linux/socket.h>
 #include <linux/tcp.h>
+#include <linux/skmsg.h>
+
 #include <net/tcp.h>
 #include <net/strparser.h>
 #include <crypto/aead.h>
@@ -103,15 +105,13 @@ struct tls_rec {
 	int tx_flags;
 	int inplace_crypto;
 
-	/* AAD | sg_plaintext_data | sg_tag */
-	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1];
-	/* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
-	struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1];
+	struct sk_msg msg_plaintext;
+	struct sk_msg msg_encrypted;
 
-	unsigned int sg_plaintext_size;
-	unsigned int sg_encrypted_size;
-	int sg_plaintext_num_elem;
-	int sg_encrypted_num_elem;
+	/* AAD | msg_plaintext.sg.data | sg_tag */
+	struct scatterlist sg_aead_in[2];
+	/* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
+	struct scatterlist sg_aead_out[2];
 
 	char aad_space[TLS_AAD_SPACE_SIZE];
 	struct aead_request aead_req;
@@ -223,8 +223,8 @@ struct tls_context {
 
 	unsigned long flags;
 	bool in_tcp_sendpages;
+	bool pending_open_record_frags;
 
-	u16 pending_open_record_frags;
 	int (*push_pending_record)(struct sock *sk, int flags);
 
 	void (*sk_write_space)(struct sock *sk);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ae2b281c9c57..56a99d0c9aa0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -73,6 +73,45 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
 }
 EXPORT_SYMBOL_GPL(sk_msg_alloc);
 
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+		 u32 off, u32 len)
+{
+	int i = src->sg.start;
+	struct scatterlist *sge = sk_msg_elem(src, i);
+	u32 sge_len, sge_off;
+
+	if (sk_msg_full(dst))
+		return -ENOSPC;
+
+	while (off) {
+		if (sge->length > off)
+			break;
+		off -= sge->length;
+		sk_msg_iter_var_next(i);
+		if (i == src->sg.end && off)
+			return -ENOSPC;
+		sge = sk_msg_elem(src, i);
+	}
+
+	while (len) {
+		sge_len = sge->length - off;
+		sge_off = sge->offset + off;
+		if (sge_len > len)
+			sge_len = len;
+		off = 0;
+		len -= sge_len;
+		sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
+		sk_mem_charge(sk, sge_len);
+		sk_msg_iter_var_next(i);
+		if (i == src->sg.end && len)
+			return -ENOSPC;
+		sge = sk_msg_elem(src, i);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_msg_clone);
+
 void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
 {
 	int i = msg->sg.start;
diff --git a/net/core/sock.c b/net/core/sock.c
index 7e8796a6a089..52e4f1c16b1e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2238,67 +2238,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
 
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
-		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
-		int first_coalesce)
-{
-	int sg_curr = *sg_curr_index, use = 0, rc = 0;
-	unsigned int size = *sg_curr_size;
-	struct page_frag *pfrag;
-	struct scatterlist *sge;
-
-	len -= size;
-	pfrag = sk_page_frag(sk);
-
-	while (len > 0) {
-		unsigned int orig_offset;
-
-		if (!sk_page_frag_refill(sk, pfrag)) {
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		use = min_t(int, len, pfrag->size - pfrag->offset);
-
-		if (!sk_wmem_schedule(sk, use)) {
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		sk_mem_charge(sk, use);
-		size += use;
-		orig_offset = pfrag->offset;
-		pfrag->offset += use;
-
-		sge = sg + sg_curr - 1;
-		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
-		    sge->offset + sge->length == orig_offset) {
-			sge->length += use;
-		} else {
-			sge = sg + sg_curr;
-			sg_unmark_end(sge);
-			sg_set_page(sge, pfrag->page, use, orig_offset);
-			get_page(pfrag->page);
-			sg_curr++;
-
-			if (sg_curr == MAX_SKB_FRAGS)
-				sg_curr = 0;
-
-			if (sg_curr == sg_start) {
-				rc = -ENOSPC;
-				break;
-			}
-		}
-
-		len -= use;
-	}
-out:
-	*sg_curr_size = size;
-	*sg_curr_index = sg_curr;
-	return rc;
-}
-EXPORT_SYMBOL(sk_alloc_sg);
-
 static void __lock_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 73f05ece53d0..99c1a19c17b1 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -8,6 +8,7 @@ config TLS
 	select CRYPTO_AES
 	select CRYPTO_GCM
 	select STREAM_PARSER
+	select NET_SOCK_MSG
 	default n
 	---help---
 	Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 961b07d4d41c..276edbc04f38 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -421,7 +421,7 @@ last_record:
 			tls_push_record_flags = flags;
 			if (more) {
 				tls_ctx->pending_open_record_frags =
-						record->num_frags;
+						!!record->num_frags;
 				break;
 			}
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index aa9fdce272b6..5043b0be1448 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -213,153 +213,49 @@ static int tls_do_decryption(struct sock *sk,
 	return ret;
 }
 
-static void trim_sg(struct sock *sk, struct scatterlist *sg,
-		    int *sg_num_elem, unsigned int *sg_size, int target_size)
-{
-	int i = *sg_num_elem - 1;
-	int trim = *sg_size - target_size;
-
-	if (trim <= 0) {
-		WARN_ON(trim < 0);
-		return;
-	}
-
-	*sg_size = target_size;
-	while (trim >= sg[i].length) {
-		trim -= sg[i].length;
-		sk_mem_uncharge(sk, sg[i].length);
-		put_page(sg_page(&sg[i]));
-		i--;
-
-		if (i < 0)
-			goto out;
-	}
-
-	sg[i].length -= trim;
-	sk_mem_uncharge(sk, trim);
-
-out:
-	*sg_num_elem = i + 1;
-}
-
-static void trim_both_sgl(struct sock *sk, int target_size)
+static void tls_trim_both_msgs(struct sock *sk, int target_size)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
 
-	trim_sg(sk, &rec->sg_plaintext_data[1],
-		&rec->sg_plaintext_num_elem,
-		&rec->sg_plaintext_size,
-		target_size);
-
+	sk_msg_trim(sk, &rec->msg_plaintext, target_size);
 	if (target_size > 0)
 		target_size += tls_ctx->tx.overhead_size;
-
-	trim_sg(sk, &rec->sg_encrypted_data[1],
-		&rec->sg_encrypted_num_elem,
-		&rec->sg_encrypted_size,
-		target_size);
+	sk_msg_trim(sk, &rec->msg_encrypted, target_size);
 }
 
-static int alloc_encrypted_sg(struct sock *sk, int len)
+static int tls_alloc_encrypted_msg(struct sock *sk, int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
-	int rc = 0;
-
-	rc = sk_alloc_sg(sk, len,
-			 &rec->sg_encrypted_data[1], 0,
-			 &rec->sg_encrypted_num_elem,
-			 &rec->sg_encrypted_size, 0);
-
-	if (rc == -ENOSPC)
-		rec->sg_encrypted_num_elem =
-			ARRAY_SIZE(rec->sg_encrypted_data) - 1;
+	struct sk_msg *msg_en = &rec->msg_encrypted;
 
-	return rc;
+	return sk_msg_alloc(sk, msg_en, len, 0);
 }
 
-static int move_to_plaintext_sg(struct sock *sk, int required_size)
+static int tls_clone_plaintext_msg(struct sock *sk, int required)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
-	struct scatterlist *plain_sg = &rec->sg_plaintext_data[1];
-	struct scatterlist *enc_sg = &rec->sg_encrypted_data[1];
-	int enc_sg_idx = 0;
+	struct sk_msg *msg_pl = &rec->msg_plaintext;
+	struct sk_msg *msg_en = &rec->msg_encrypted;
 	int skip, len;
 
-	if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS)
-		return -ENOSPC;
-
-	/* We add page references worth len bytes from enc_sg at the
-	 * end of plain_sg. It is guaranteed that sg_encrypted_data
+	/* We add page references worth len bytes from encrypted sg
+	 * at the end of plaintext sg. It is guaranteed that msg_en
 	 * has enough required room (ensured by caller).
 	 */
-	len = required_size - rec->sg_plaintext_size;
+	len = required - msg_pl->sg.size;
 
-	/* Skip initial bytes in sg_encrypted_data to be able
-	 * to use same offset of both plain and encrypted data.
+	/* Skip initial bytes in msg_en's data to be able to use
+	 * same offset of both plain and encrypted data.
 	 */
-	skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size;
-
-	while (enc_sg_idx < rec->sg_encrypted_num_elem) {
-		if (enc_sg[enc_sg_idx].length > skip)
-			break;
-
-		skip -= enc_sg[enc_sg_idx].length;
-		enc_sg_idx++;
-	}
+	skip = tls_ctx->tx.prepend_size + msg_pl->sg.size;
 
-	/* unmark the end of plain_sg*/
-	sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1);
-
-	while (len) {
-		struct page *page = sg_page(&enc_sg[enc_sg_idx]);
-		int bytes = enc_sg[enc_sg_idx].length - skip;
-		int offset = enc_sg[enc_sg_idx].offset + skip;
-
-		if (bytes > len)
-			bytes = len;
-		else
-			enc_sg_idx++;
-
-		/* Skipping is required only one time */
-		skip = 0;
-
-		/* Increment page reference */
-		get_page(page);
-
-		sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page,
-			    bytes, offset);
-
-		sk_mem_charge(sk, bytes);
-
-		len -= bytes;
-		rec->sg_plaintext_size += bytes;
-
-		rec->sg_plaintext_num_elem++;
-
-		if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS)
-			return -ENOSPC;
-	}
-
-	return 0;
-}
-
-static void free_sg(struct sock *sk, struct scatterlist *sg,
-		    int *sg_num_elem, unsigned int *sg_size)
-{
-	int i, n = *sg_num_elem;
-
-	for (i = 0; i < n; ++i) {
-		sk_mem_uncharge(sk, sg[i].length);
-		put_page(sg_page(&sg[i]));
-	}
-	*sg_num_elem = 0;
-	*sg_size = 0;
+	return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
 }
 
 static void tls_free_open_rec(struct sock *sk)
@@ -372,14 +268,8 @@ static void tls_free_open_rec(struct sock *sk)
 	if (!rec)
 		return;
 
-	free_sg(sk, &rec->sg_encrypted_data[1],
-		&rec->sg_encrypted_num_elem,
-		&rec->sg_encrypted_size);
-
-	free_sg(sk, &rec->sg_plaintext_data[1],
-		&rec->sg_plaintext_num_elem,
-		&rec->sg_plaintext_size);
-
+	sk_msg_free(sk, &rec->msg_encrypted);
+	sk_msg_free(sk, &rec->msg_plaintext);
 	kfree(rec);
 }
 
@@ -388,6 +278,7 @@ int tls_tx_records(struct sock *sk, int flags)
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec, *tmp;
+	struct sk_msg *msg_en;
 	int tx_flags, rc = 0;
 
 	if (tls_is_partially_sent_record(tls_ctx)) {
@@ -407,9 +298,7 @@ int tls_tx_records(struct sock *sk, int flags)
 		 * Remove the head of tx_list
 		 */
 		list_del(&rec->list);
-		free_sg(sk, &rec->sg_plaintext_data[1],
-			&rec->sg_plaintext_num_elem, &rec->sg_plaintext_size);
-
+		sk_msg_free(sk, &rec->msg_plaintext);
 		kfree(rec);
 	}
 
@@ -421,17 +310,15 @@ int tls_tx_records(struct sock *sk, int flags)
 			else
 				tx_flags = flags;
 
+			msg_en = &rec->msg_encrypted;
 			rc = tls_push_sg(sk, tls_ctx,
-					 &rec->sg_encrypted_data[1],
+					 &msg_en->sg.data[msg_en->sg.curr],
 					 0, tx_flags);
 			if (rc)
 				goto tx_err;
 
 			list_del(&rec->list);
-			free_sg(sk, &rec->sg_plaintext_data[1],
-				&rec->sg_plaintext_num_elem,
-				&rec->sg_plaintext_size);
-
+			sk_msg_free(sk, &rec->msg_plaintext);
 			kfree(rec);
 		} else {
 			break;
@@ -451,15 +338,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 	struct sock *sk = req->data;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct scatterlist *sge;
+	struct sk_msg *msg_en;
 	struct tls_rec *rec;
 	bool ready = false;
 	int pending;
 
 	rec = container_of(aead_req, struct tls_rec, aead_req);
+	msg_en = &rec->msg_encrypted;
 
-	rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size;
-	rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size;
-
+	sge = sk_msg_elem(msg_en, msg_en->sg.curr);
+	sge->offset -= tls_ctx->tx.prepend_size;
+	sge->length += tls_ctx->tx.prepend_size;
 
 	/* Check if error is previously set on socket */
 	if (err || sk->sk_err) {
@@ -497,31 +387,29 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
 
 	/* Schedule the transmission */
 	if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
-		schedule_delayed_work(&ctx->tx_work.work, 2);
+		schedule_delayed_work(&ctx->tx_work.work, 1);
 }
 
 static int tls_do_encryption(struct sock *sk,
 			     struct tls_context *tls_ctx,
 			     struct tls_sw_context_tx *ctx,
 			     struct aead_request *aead_req,
-			     size_t data_len)
+			     size_t data_len, u32 start)
 {
 	struct tls_rec *rec = ctx->open_rec;
-	struct scatterlist *plain_sg = rec->sg_plaintext_data;
-	struct scatterlist *enc_sg = rec->sg_encrypted_data;
+	struct sk_msg *msg_en = &rec->msg_encrypted;
+	struct scatterlist *sge = sk_msg_elem(msg_en, start);
 	int rc;
 
-	/* Skip the first index as it contains AAD data */
-	rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size;
-	rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size;
+	sge->offset += tls_ctx->tx.prepend_size;
+	sge->length -= tls_ctx->tx.prepend_size;
 
-	/* If it is inplace crypto, then pass same SG list as both src, dst */
-	if (rec->inplace_crypto)
-		plain_sg = enc_sg;
+	msg_en->sg.curr = start;
 
 	aead_request_set_tfm(aead_req, ctx->aead_send);
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
-	aead_request_set_crypt(aead_req, plain_sg, enc_sg,
+	aead_request_set_crypt(aead_req, rec->sg_aead_in,
+			       rec->sg_aead_out,
 			       data_len, tls_ctx->tx.iv);
 
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
@@ -534,8 +422,8 @@ static int tls_do_encryption(struct sock *sk,
 	rc = crypto_aead_encrypt(aead_req);
 	if (!rc || rc != -EINPROGRESS) {
 		atomic_dec(&ctx->encrypt_pending);
-		rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size;
-		rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size;
+		sge->offset -= tls_ctx->tx.prepend_size;
+		sge->length += tls_ctx->tx.prepend_size;
 	}
 
 	if (!rc) {
@@ -557,35 +445,50 @@ static int tls_push_record(struct sock *sk, int flags,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec;
+	struct sk_msg *msg_pl, *msg_en;
 	struct aead_request *req;
 	int rc;
+	u32 i;
 
 	if (!rec)
 		return 0;
 
+	msg_pl = &rec->msg_plaintext;
+	msg_en = &rec->msg_encrypted;
+
 	rec->tx_flags = flags;
 	req = &rec->aead_req;
 
-	sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem);
-	sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem);
+	i = msg_pl->sg.end;
+	sk_msg_iter_var_prev(i);
+	sg_mark_end(sk_msg_elem(msg_pl, i));
 
-	tls_make_aad(rec->aad_space, rec->sg_plaintext_size,
+	i = msg_pl->sg.start;
+	sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
+		 &msg_en->sg.data[i] : &msg_pl->sg.data[i]);
+
+	i = msg_en->sg.end;
+	sk_msg_iter_var_prev(i);
+	sg_mark_end(sk_msg_elem(msg_en, i));
+
+	i = msg_en->sg.start;
+	sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
+
+	tls_make_aad(rec->aad_space, msg_pl->sg.size,
 		     tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
 		     record_type);
 
 	tls_fill_prepend(tls_ctx,
-			 page_address(sg_page(&rec->sg_encrypted_data[1])) +
-			 rec->sg_encrypted_data[1].offset,
-			 rec->sg_plaintext_size, record_type);
+			 page_address(sg_page(&msg_en->sg.data[i])) +
+			 msg_en->sg.data[i].offset, msg_pl->sg.size,
+			 record_type);
 
-	tls_ctx->pending_open_record_frags = 0;
-
-	rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size);
-	if (rc == -EINPROGRESS)
-		return -EINPROGRESS;
+	tls_ctx->pending_open_record_frags = false;
 
+	rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i);
 	if (rc < 0) {
-		tls_err_abort(sk, EBADMSG);
+		if (rc != -EINPROGRESS)
+			tls_err_abort(sk, EBADMSG);
 		return rc;
 	}
 
@@ -597,104 +500,11 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags)
 	return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA);
 }
 
-static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
-			      int length, int *pages_used,
-			      unsigned int *size_used,
-			      struct scatterlist *to, int to_max_pages,
-			      bool charge)
-{
-	struct page *pages[MAX_SKB_FRAGS];
-
-	size_t offset;
-	ssize_t copied, use;
-	int i = 0;
-	unsigned int size = *size_used;
-	int num_elem = *pages_used;
-	int rc = 0;
-	int maxpages;
-
-	while (length > 0) {
-		i = 0;
-		maxpages = to_max_pages - num_elem;
-		if (maxpages == 0) {
-			rc = -EFAULT;
-			goto out;
-		}
-		copied = iov_iter_get_pages(from, pages,
-					    length,
-					    maxpages, &offset);
-		if (copied <= 0) {
-			rc = -EFAULT;
-			goto out;
-		}
-
-		iov_iter_advance(from, copied);
-
-		length -= copied;
-		size += copied;
-		while (copied) {
-			use = min_t(int, copied, PAGE_SIZE - offset);
-
-			sg_set_page(&to[num_elem],
-				    pages[i], use, offset);
-			sg_unmark_end(&to[num_elem]);
-			if (charge)
-				sk_mem_charge(sk, use);
-
-			offset = 0;
-			copied -= use;
-
-			++i;
-			++num_elem;
-		}
-	}
-
-	/* Mark the end in the last sg entry if newly added */
-	if (num_elem > *pages_used)
-		sg_mark_end(&to[num_elem - 1]);
-out:
-	if (rc)
-		iov_iter_revert(from, size - *size_used);
-	*size_used = size;
-	*pages_used = num_elem;
-
-	return rc;
-}
-
-static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
-			     int bytes)
-{
-	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	struct tls_rec *rec = ctx->open_rec;
-	struct scatterlist *sg = &rec->sg_plaintext_data[1];
-	int copy, i, rc = 0;
-
-	for (i = tls_ctx->pending_open_record_frags;
-	     i < rec->sg_plaintext_num_elem; ++i) {
-		copy = sg[i].length;
-		if (copy_from_iter(
-				page_address(sg_page(&sg[i])) + sg[i].offset,
-				copy, from) != copy) {
-			rc = -EFAULT;
-			goto out;
-		}
-		bytes -= copy;
-
-		++tls_ctx->pending_open_record_frags;
-
-		if (!bytes)
-			break;
-	}
-
-out:
-	return rc;
-}
-
 static struct tls_rec *get_rec(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct sk_msg *msg_pl, *msg_en;
 	struct tls_rec *rec;
 	int mem_size;
 
@@ -708,15 +518,21 @@ static struct tls_rec *get_rec(struct sock *sk)
 	if (!rec)
 		return NULL;
 
-	sg_init_table(&rec->sg_plaintext_data[0],
-		      ARRAY_SIZE(rec->sg_plaintext_data));
-	sg_init_table(&rec->sg_encrypted_data[0],
-		      ARRAY_SIZE(rec->sg_encrypted_data));
+	msg_pl = &rec->msg_plaintext;
+	msg_en = &rec->msg_encrypted;
+
+	sk_msg_init(msg_pl);
+	sk_msg_init(msg_en);
 
-	sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space,
+	sg_init_table(rec->sg_aead_in, 2);
+	sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
 		   sizeof(rec->aad_space));
-	sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space,
+	sg_unmark_end(&rec->sg_aead_in[1]);
+
+	sg_init_table(rec->sg_aead_out, 2);
+	sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
 		   sizeof(rec->aad_space));
+	sg_unmark_end(&rec->sg_aead_out[1]);
 
 	ctx->open_rec = rec;
 	rec->inplace_crypto = 1;
@@ -735,6 +551,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	bool is_kvec = msg->msg_iter.type & ITER_KVEC;
 	bool eor = !(msg->msg_flags & MSG_MORE);
 	size_t try_to_copy, copied = 0;
+	struct sk_msg *msg_pl, *msg_en;
 	struct tls_rec *rec;
 	int required_size;
 	int num_async = 0;
@@ -778,23 +595,26 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			goto send_end;
 		}
 
-		orig_size = rec->sg_plaintext_size;
+		msg_pl = &rec->msg_plaintext;
+		msg_en = &rec->msg_encrypted;
+
+		orig_size = msg_pl->sg.size;
 		full_record = false;
 		try_to_copy = msg_data_left(msg);
-		record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size;
+		record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
 		if (try_to_copy >= record_room) {
 			try_to_copy = record_room;
 			full_record = true;
 		}
 
-		required_size = rec->sg_plaintext_size + try_to_copy +
+		required_size = msg_pl->sg.size + try_to_copy +
 				tls_ctx->tx.overhead_size;
 
 		if (!sk_stream_memory_free(sk))
 			goto wait_for_sndbuf;
 
 alloc_encrypted:
-		ret = alloc_encrypted_sg(sk, required_size);
+		ret = tls_alloc_encrypted_msg(sk, required_size);
 		if (ret) {
 			if (ret != -ENOSPC)
 				goto wait_for_memory;
@@ -803,17 +623,13 @@ alloc_encrypted:
 			 * actually allocated. The difference is due
 			 * to max sg elements limit
 			 */
-			try_to_copy -= required_size - rec->sg_encrypted_size;
+			try_to_copy -= required_size - msg_en->sg.size;
 			full_record = true;
 		}
 
 		if (!is_kvec && (full_record || eor) && !async_capable) {
-			ret = zerocopy_from_iter(sk, &msg->msg_iter,
-				try_to_copy, &rec->sg_plaintext_num_elem,
-				&rec->sg_plaintext_size,
-				&rec->sg_plaintext_data[1],
-				ARRAY_SIZE(rec->sg_plaintext_data) - 1,
-				true);
+			ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
+							msg_pl, try_to_copy);
 			if (ret)
 				goto fallback_to_reg_send;
 
@@ -831,15 +647,12 @@ alloc_encrypted:
 			continue;
 
 fallback_to_reg_send:
-			trim_sg(sk, &rec->sg_plaintext_data[1],
-				&rec->sg_plaintext_num_elem,
-				&rec->sg_plaintext_size,
-				orig_size);
+			sk_msg_trim(sk, msg_pl, orig_size);
 		}
 
-		required_size = rec->sg_plaintext_size + try_to_copy;
+		required_size = msg_pl->sg.size + try_to_copy;
 
-		ret = move_to_plaintext_sg(sk, required_size);
+		ret = tls_clone_plaintext_msg(sk, required_size);
 		if (ret) {
 			if (ret != -ENOSPC)
 				goto send_end;
@@ -848,20 +661,21 @@ fallback_to_reg_send:
 			 * actually allocated. The difference is due
 			 * to max sg elements limit
 			 */
-			try_to_copy -= required_size - rec->sg_plaintext_size;
+			try_to_copy -= required_size - msg_pl->sg.size;
 			full_record = true;
-
-			trim_sg(sk, &rec->sg_encrypted_data[1],
-				&rec->sg_encrypted_num_elem,
-				&rec->sg_encrypted_size,
-				rec->sg_plaintext_size +
-				tls_ctx->tx.overhead_size);
+			sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+				    tls_ctx->tx.overhead_size);
 		}
 
-		ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
-		if (ret)
+		ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl,
+					       try_to_copy);
+		if (ret < 0)
 			goto trim_sgl;
 
+		/* Open records defined only if successfully copied, otherwise
+		 * we would trim the sg but not reset the open record frags.
+		 */
+		tls_ctx->pending_open_record_frags = true;
 		copied += try_to_copy;
 		if (full_record || eor) {
 			ret = tls_push_record(sk, msg->msg_flags, record_type);
@@ -881,11 +695,11 @@ wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
 trim_sgl:
-			trim_both_sgl(sk, orig_size);
+			tls_trim_both_msgs(sk, orig_size);
 			goto send_end;
 		}
 
-		if (rec->sg_encrypted_size < required_size)
+		if (msg_en->sg.size < required_size)
 			goto alloc_encrypted;
 	}
 
@@ -929,7 +743,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	unsigned char record_type = TLS_RECORD_TYPE_DATA;
 	size_t orig_size = size;
-	struct scatterlist *sg;
+	struct sk_msg *msg_pl;
 	struct tls_rec *rec;
 	int num_async = 0;
 	bool full_record;
@@ -970,20 +784,23 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 			goto sendpage_end;
 		}
 
+		msg_pl = &rec->msg_plaintext;
+
 		full_record = false;
-		record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size;
+		record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
 		copy = size;
 		if (copy >= record_room) {
 			copy = record_room;
 			full_record = true;
 		}
-		required_size = rec->sg_plaintext_size + copy +
-			      tls_ctx->tx.overhead_size;
+
+		required_size = msg_pl->sg.size + copy +
+				tls_ctx->tx.overhead_size;
 
 		if (!sk_stream_memory_free(sk))
 			goto wait_for_sndbuf;
 alloc_payload:
-		ret = alloc_encrypted_sg(sk, required_size);
+		ret = tls_alloc_encrypted_msg(sk, required_size);
 		if (ret) {
 			if (ret != -ENOSPC)
 				goto wait_for_memory;
@@ -992,26 +809,18 @@ alloc_payload:
 			 * actually allocated. The difference is due
 			 * to max sg elements limit
 			 */
-			copy -= required_size - rec->sg_plaintext_size;
+			copy -= required_size - msg_pl->sg.size;
 			full_record = true;
 		}
 
-		get_page(page);
-		sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem;
-		sg_set_page(sg, page, copy, offset);
-		sg_unmark_end(sg);
-
-		rec->sg_plaintext_num_elem++;
-
+		sk_msg_page_add(msg_pl, page, copy, offset);
 		sk_mem_charge(sk, copy);
+
 		offset += copy;
 		size -= copy;
-		rec->sg_plaintext_size += copy;
-		tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem;
 
-		if (full_record || eor ||
-		    rec->sg_plaintext_num_elem ==
-		    ARRAY_SIZE(rec->sg_plaintext_data) - 1) {
+		tls_ctx->pending_open_record_frags = true;
+		if (full_record || eor || sk_msg_full(msg_pl)) {
 			rec->inplace_crypto = 0;
 			ret = tls_push_record(sk, flags, record_type);
 			if (ret) {
@@ -1027,7 +836,7 @@ wait_for_sndbuf:
 wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
-			trim_both_sgl(sk, rec->sg_plaintext_size);
+			tls_trim_both_msgs(sk, msg_pl->sg.size);
 			goto sendpage_end;
 		}
 
@@ -1092,6 +901,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
 	return skb;
 }
 
+static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from,
+			       int length, int *pages_used,
+			       unsigned int *size_used,
+			       struct scatterlist *to,
+			       int to_max_pages)
+{
+	int rc = 0, i = 0, num_elem = *pages_used, maxpages;
+	struct page *pages[MAX_SKB_FRAGS];
+	unsigned int size = *size_used;
+	ssize_t copied, use;
+	size_t offset;
+
+	while (length > 0) {
+		i = 0;
+		maxpages = to_max_pages - num_elem;
+		if (maxpages == 0) {
+			rc = -EFAULT;
+			goto out;
+		}
+		copied = iov_iter_get_pages(from, pages,
+					    length,
+					    maxpages, &offset);
+		if (copied <= 0) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		iov_iter_advance(from, copied);
+
+		length -= copied;
+		size += copied;
+		while (copied) {
+			use = min_t(int, copied, PAGE_SIZE - offset);
+
+			sg_set_page(&to[num_elem],
+				    pages[i], use, offset);
+			sg_unmark_end(&to[num_elem]);
+			/* We do not uncharge memory from this API */
+
+			offset = 0;
+			copied -= use;
+
+			i++;
+			num_elem++;
+		}
+	}
+	/* Mark the end in the last sg entry if newly added */
+	if (num_elem > *pages_used)
+		sg_mark_end(&to[num_elem - 1]);
+out:
+	if (rc)
+		iov_iter_revert(from, size - *size_used);
+	*size_used = size;
+	*pages_used = num_elem;
+
+	return rc;
+}
+
 /* This function decrypts the input skb into either out_iov or in out_sg
  * or in skb buffers itself. The input parameter 'zc' indicates if
  * zero-copy mode needs to be tried or not. With zero-copy mode, either
@@ -1189,9 +1056,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
 			sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE);
 
 			*chunk = 0;
-			err = zerocopy_from_iter(sk, out_iov, data_len, &pages,
-						 chunk, &sgout[1],
-						 (n_sgout - 1), false);
+			err = tls_setup_from_iter(sk, out_iov, data_len,
+						  &pages, chunk, &sgout[1],
+						  (n_sgout - 1));
 			if (err < 0)
 				goto fallback_to_reg_recv;
 		} else if (out_sg) {
@@ -1619,25 +1486,15 @@ void tls_sw_free_resources_tx(struct sock *sk)
 
 		rec = list_first_entry(&ctx->tx_list,
 				       struct tls_rec, list);
-
-		free_sg(sk, &rec->sg_plaintext_data[1],
-			&rec->sg_plaintext_num_elem,
-			&rec->sg_plaintext_size);
-
 		list_del(&rec->list);
+		sk_msg_free(sk, &rec->msg_plaintext);
 		kfree(rec);
 	}
 
 	list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
-		free_sg(sk, &rec->sg_encrypted_data[1],
-			&rec->sg_encrypted_num_elem,
-			&rec->sg_encrypted_size);
-
-		free_sg(sk, &rec->sg_plaintext_data[1],
-			&rec->sg_plaintext_num_elem,
-			&rec->sg_plaintext_size);
-
 		list_del(&rec->list);
+		sk_msg_free(sk, &rec->msg_encrypted);
+		sk_msg_free(sk, &rec->msg_plaintext);
 		kfree(rec);
 	}
 
-- 
cgit v1.2.3


From 924ad65ed01ee0eec5d2a3280c01c394343d6df7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 13 Oct 2018 02:46:00 +0200
Subject: tls: replace poll implementation with read hook

Instead of re-implementing poll routine use the poll callback to
trigger read from kTLS, we reuse the stream_memory_read callback
which is simpler and achieves the same. This helps to align sockmap
and kTLS so we can more easily embed BPF in kTLS.

Joint work with Daniel.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tls.h  |  6 ++----
 net/tls/tls_main.c | 11 ++++++-----
 net/tls/tls_sw.c   | 16 +++-------------
 3 files changed, 11 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 3d22d8a59be7..bab5627ff5e3 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -142,8 +142,7 @@ struct tls_sw_context_rx {
 
 	struct strparser strp;
 	void (*saved_data_ready)(struct sock *sk);
-	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
-				struct poll_table_struct *wait);
+
 	struct sk_buff *recv_pkt;
 	u8 control;
 	bool decrypted;
@@ -272,8 +271,7 @@ void tls_sw_free_resources_rx(struct sock *sk);
 void tls_sw_release_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
-			 struct poll_table_struct *wait);
+bool tls_sw_stream_read(const struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b428069a1b05..e90b6d537077 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -620,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 	prot[TLS_SW][TLS_BASE].sendpage		= tls_sw_sendpage;
 
 	prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
-	prot[TLS_BASE][TLS_SW].recvmsg		= tls_sw_recvmsg;
-	prot[TLS_BASE][TLS_SW].close		= tls_sk_proto_close;
+	prot[TLS_BASE][TLS_SW].recvmsg		  = tls_sw_recvmsg;
+	prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
+	prot[TLS_BASE][TLS_SW].close		  = tls_sk_proto_close;
 
 	prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
-	prot[TLS_SW][TLS_SW].recvmsg	= tls_sw_recvmsg;
-	prot[TLS_SW][TLS_SW].close	= tls_sk_proto_close;
+	prot[TLS_SW][TLS_SW].recvmsg		= tls_sw_recvmsg;
+	prot[TLS_SW][TLS_SW].stream_memory_read	= tls_sw_stream_read;
+	prot[TLS_SW][TLS_SW].close		= tls_sk_proto_close;
 
 #ifdef CONFIG_TLS_DEVICE
 	prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
@@ -724,7 +726,6 @@ static int __init tls_register(void)
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
 	tls_sw_proto_ops = inet_stream_ops;
-	tls_sw_proto_ops.poll = tls_sw_poll;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
 #ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 5043b0be1448..3b75e0dd51a2 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1352,23 +1352,15 @@ splice_read_end:
 	return copied ? : err;
 }
 
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
-			 struct poll_table_struct *wait)
+bool tls_sw_stream_read(const struct sock *sk)
 {
-	unsigned int ret;
-	struct sock *sk = sock->sk;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
-	/* Grab POLLOUT and POLLHUP from the underlying socket */
-	ret = ctx->sk_poll(file, sock, wait);
-
-	/* Clear POLLIN bits, and set based on recv_pkt */
-	ret &= ~(POLLIN | POLLRDNORM);
 	if (ctx->recv_pkt)
-		ret |= POLLIN | POLLRDNORM;
+		return true;
 
-	return ret;
+	return false;
 }
 
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1686,8 +1678,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		sk->sk_data_ready = tls_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
 
-		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
-
 		strp_check_rcv(&sw_ctx_rx->strp);
 	}
 
-- 
cgit v1.2.3


From d3b18ad31f93d0b6bae105c679018a1ba7daa9ca Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sat, 13 Oct 2018 02:46:01 +0200
Subject: tls: add bpf support to sk_msg handling

This work adds BPF sk_msg verdict program support to kTLS
allowing BPF and kTLS to be combined together. Previously kTLS
and sk_msg verdict programs were mutually exclusive in the
ULP layer which created challenges for the orchestrator when
trying to apply TCP based policy, for example. To resolve this,
leveraging the work from previous patches that consolidates
the use of sk_msg, we can finally enable BPF sk_msg verdict
programs so they continue to run after the kTLS socket is
created. No change in behavior when kTLS is not used in
combination with BPF, the kselftest suite for kTLS also runs
successfully.

Joint work with Daniel.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skmsg.h |  41 ++++-
 net/tls/tls_sw.c      | 439 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 414 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 4e84b3c2eff8..0b919f0bc6d6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -29,7 +29,11 @@ struct sk_msg_sg {
 	u32				size;
 	u32				copybreak;
 	bool				copy[MAX_MSG_FRAGS];
-	struct scatterlist		data[MAX_MSG_FRAGS];
+	/* The extra element is used for chaining the front and sections when
+	 * the list becomes partitioned (e.g. end < start). The crypto APIs
+	 * require the chaining.
+	 */
+	struct scatterlist		data[MAX_MSG_FRAGS + 1];
 };
 
 struct sk_msg {
@@ -112,6 +116,7 @@ void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
 				  u32 bytes);
 
 void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes);
 
 int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			      struct sk_msg *msg, u32 bytes);
@@ -161,8 +166,9 @@ static inline void sk_msg_clear_meta(struct sk_msg *msg)
 
 static inline void sk_msg_init(struct sk_msg *msg)
 {
+	BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS);
 	memset(msg, 0, sizeof(*msg));
-	sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data));
+	sg_init_marker(msg->sg.data, MAX_MSG_FRAGS);
 }
 
 static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
@@ -174,6 +180,12 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
 	src->sg.data[which].offset += size;
 }
 
+static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
+{
+	memcpy(dst, src, sizeof(*src));
+	sk_msg_init(src);
+}
+
 static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
 {
 	return msg->sg.end >= msg->sg.start ?
@@ -229,6 +241,26 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
 	sk_msg_iter_next(msg, end);
 }
 
+static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state)
+{
+	do {
+		msg->sg.copy[i] = copy_state;
+		sk_msg_iter_var_next(i);
+		if (i == msg->sg.end)
+			break;
+	} while (1);
+}
+
+static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start)
+{
+	sk_msg_sg_copy(msg, start, true);
+}
+
+static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
+{
+	sk_msg_sg_copy(msg, start, false);
+}
+
 static inline struct sk_psock *sk_psock(const struct sock *sk)
 {
 	return rcu_dereference_sk_user_data(sk);
@@ -245,6 +277,11 @@ static inline void sk_psock_queue_msg(struct sk_psock *psock,
 	list_add_tail(&msg->list, &psock->ingress_msg);
 }
 
+static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
+{
+	return psock ? list_empty(&psock->ingress_msg) : true;
+}
+
 static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 {
 	struct sock *sk = psock->sk;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 3b75e0dd51a2..a525fc4c2a4b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved.
  * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved.
  * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved.
+ * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -258,21 +259,58 @@ static int tls_clone_plaintext_msg(struct sock *sk, int required)
 	return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
 }
 
-static void tls_free_open_rec(struct sock *sk)
+static struct tls_rec *tls_get_rec(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	struct tls_rec *rec = ctx->open_rec;
+	struct sk_msg *msg_pl, *msg_en;
+	struct tls_rec *rec;
+	int mem_size;
 
-	/* Return if there is no open record */
+	mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
+
+	rec = kzalloc(mem_size, sk->sk_allocation);
 	if (!rec)
-		return;
+		return NULL;
 
+	msg_pl = &rec->msg_plaintext;
+	msg_en = &rec->msg_encrypted;
+
+	sk_msg_init(msg_pl);
+	sk_msg_init(msg_en);
+
+	sg_init_table(rec->sg_aead_in, 2);
+	sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
+		   sizeof(rec->aad_space));
+	sg_unmark_end(&rec->sg_aead_in[1]);
+
+	sg_init_table(rec->sg_aead_out, 2);
+	sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
+		   sizeof(rec->aad_space));
+	sg_unmark_end(&rec->sg_aead_out[1]);
+
+	return rec;
+}
+
+static void tls_free_rec(struct sock *sk, struct tls_rec *rec)
+{
 	sk_msg_free(sk, &rec->msg_encrypted);
 	sk_msg_free(sk, &rec->msg_plaintext);
 	kfree(rec);
 }
 
+static void tls_free_open_rec(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
+
+	if (rec) {
+		tls_free_rec(sk, rec);
+		ctx->open_rec = NULL;
+	}
+}
+
 int tls_tx_records(struct sock *sk, int flags)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -439,16 +477,135 @@ static int tls_do_encryption(struct sock *sk,
 	return rc;
 }
 
+static int tls_split_open_record(struct sock *sk, struct tls_rec *from,
+				 struct tls_rec **to, struct sk_msg *msg_opl,
+				 struct sk_msg *msg_oen, u32 split_point,
+				 u32 tx_overhead_size, u32 *orig_end)
+{
+	u32 i, j, bytes = 0, apply = msg_opl->apply_bytes;
+	struct scatterlist *sge, *osge, *nsge;
+	u32 orig_size = msg_opl->sg.size;
+	struct scatterlist tmp = { };
+	struct sk_msg *msg_npl;
+	struct tls_rec *new;
+	int ret;
+
+	new = tls_get_rec(sk);
+	if (!new)
+		return -ENOMEM;
+	ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size +
+			   tx_overhead_size, 0);
+	if (ret < 0) {
+		tls_free_rec(sk, new);
+		return ret;
+	}
+
+	*orig_end = msg_opl->sg.end;
+	i = msg_opl->sg.start;
+	sge = sk_msg_elem(msg_opl, i);
+	while (apply && sge->length) {
+		if (sge->length > apply) {
+			u32 len = sge->length - apply;
+
+			get_page(sg_page(sge));
+			sg_set_page(&tmp, sg_page(sge), len,
+				    sge->offset + apply);
+			sge->length = apply;
+			bytes += apply;
+			apply = 0;
+		} else {
+			apply -= sge->length;
+			bytes += sge->length;
+		}
+
+		sk_msg_iter_var_next(i);
+		if (i == msg_opl->sg.end)
+			break;
+		sge = sk_msg_elem(msg_opl, i);
+	}
+
+	msg_opl->sg.end = i;
+	msg_opl->sg.curr = i;
+	msg_opl->sg.copybreak = 0;
+	msg_opl->apply_bytes = 0;
+	msg_opl->sg.size = bytes;
+
+	msg_npl = &new->msg_plaintext;
+	msg_npl->apply_bytes = apply;
+	msg_npl->sg.size = orig_size - bytes;
+
+	j = msg_npl->sg.start;
+	nsge = sk_msg_elem(msg_npl, j);
+	if (tmp.length) {
+		memcpy(nsge, &tmp, sizeof(*nsge));
+		sk_msg_iter_var_next(j);
+		nsge = sk_msg_elem(msg_npl, j);
+	}
+
+	osge = sk_msg_elem(msg_opl, i);
+	while (osge->length) {
+		memcpy(nsge, osge, sizeof(*nsge));
+		sg_unmark_end(nsge);
+		sk_msg_iter_var_next(i);
+		sk_msg_iter_var_next(j);
+		if (i == *orig_end)
+			break;
+		osge = sk_msg_elem(msg_opl, i);
+		nsge = sk_msg_elem(msg_npl, j);
+	}
+
+	msg_npl->sg.end = j;
+	msg_npl->sg.curr = j;
+	msg_npl->sg.copybreak = 0;
+
+	*to = new;
+	return 0;
+}
+
+static void tls_merge_open_record(struct sock *sk, struct tls_rec *to,
+				  struct tls_rec *from, u32 orig_end)
+{
+	struct sk_msg *msg_npl = &from->msg_plaintext;
+	struct sk_msg *msg_opl = &to->msg_plaintext;
+	struct scatterlist *osge, *nsge;
+	u32 i, j;
+
+	i = msg_opl->sg.end;
+	sk_msg_iter_var_prev(i);
+	j = msg_npl->sg.start;
+
+	osge = sk_msg_elem(msg_opl, i);
+	nsge = sk_msg_elem(msg_npl, j);
+
+	if (sg_page(osge) == sg_page(nsge) &&
+	    osge->offset + osge->length == nsge->offset) {
+		osge->length += nsge->length;
+		put_page(sg_page(nsge));
+	}
+
+	msg_opl->sg.end = orig_end;
+	msg_opl->sg.curr = orig_end;
+	msg_opl->sg.copybreak = 0;
+	msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size;
+	msg_opl->sg.size += msg_npl->sg.size;
+
+	sk_msg_free(sk, &to->msg_encrypted);
+	sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted);
+
+	kfree(from);
+}
+
 static int tls_push_record(struct sock *sk, int flags,
 			   unsigned char record_type)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	struct tls_rec *rec = ctx->open_rec;
+	struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
+	u32 i, split_point, uninitialized_var(orig_end);
 	struct sk_msg *msg_pl, *msg_en;
 	struct aead_request *req;
+	bool split;
 	int rc;
-	u32 i;
 
 	if (!rec)
 		return 0;
@@ -456,6 +613,18 @@ static int tls_push_record(struct sock *sk, int flags,
 	msg_pl = &rec->msg_plaintext;
 	msg_en = &rec->msg_encrypted;
 
+	split_point = msg_pl->apply_bytes;
+	split = split_point && split_point < msg_pl->sg.size;
+	if (split) {
+		rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
+					   split_point, tls_ctx->tx.overhead_size,
+					   &orig_end);
+		if (rc < 0)
+			return rc;
+		sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+			    tls_ctx->tx.overhead_size);
+	}
+
 	rec->tx_flags = flags;
 	req = &rec->aead_req;
 
@@ -487,57 +656,139 @@ static int tls_push_record(struct sock *sk, int flags,
 
 	rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i);
 	if (rc < 0) {
-		if (rc != -EINPROGRESS)
+		if (rc != -EINPROGRESS) {
 			tls_err_abort(sk, EBADMSG);
+			if (split) {
+				tls_ctx->pending_open_record_frags = true;
+				tls_merge_open_record(sk, rec, tmp, orig_end);
+			}
+		}
 		return rc;
+	} else if (split) {
+		msg_pl = &tmp->msg_plaintext;
+		msg_en = &tmp->msg_encrypted;
+		sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+			    tls_ctx->tx.overhead_size);
+		tls_ctx->pending_open_record_frags = true;
+		ctx->open_rec = tmp;
 	}
 
 	return tls_tx_records(sk, flags);
 }
 
-static int tls_sw_push_pending_record(struct sock *sk, int flags)
-{
-	return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA);
-}
-
-static struct tls_rec *get_rec(struct sock *sk)
+static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
+			       bool full_record, u8 record_type,
+			       size_t *copied, int flags)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-	struct sk_msg *msg_pl, *msg_en;
+	struct sk_msg msg_redir = { };
+	struct sk_psock *psock;
+	struct sock *sk_redir;
 	struct tls_rec *rec;
-	int mem_size;
+	int err = 0, send;
+	bool enospc;
+
+	psock = sk_psock_get(sk);
+	if (!psock)
+		return tls_push_record(sk, flags, record_type);
+more_data:
+	enospc = sk_msg_full(msg);
+	if (psock->eval == __SK_NONE)
+		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+	if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
+	    !enospc && !full_record) {
+		err = -ENOSPC;
+		goto out_err;
+	}
+	msg->cork_bytes = 0;
+	send = msg->sg.size;
+	if (msg->apply_bytes && msg->apply_bytes < send)
+		send = msg->apply_bytes;
+
+	switch (psock->eval) {
+	case __SK_PASS:
+		err = tls_push_record(sk, flags, record_type);
+		if (err < 0) {
+			*copied -= sk_msg_free(sk, msg);
+			tls_free_open_rec(sk);
+			goto out_err;
+		}
+		break;
+	case __SK_REDIRECT:
+		sk_redir = psock->sk_redir;
+		memcpy(&msg_redir, msg, sizeof(*msg));
+		if (msg->apply_bytes < send)
+			msg->apply_bytes = 0;
+		else
+			msg->apply_bytes -= send;
+		sk_msg_return_zero(sk, msg, send);
+		msg->sg.size -= send;
+		release_sock(sk);
+		err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags);
+		lock_sock(sk);
+		if (err < 0) {
+			*copied -= sk_msg_free_nocharge(sk, &msg_redir);
+			msg->sg.size = 0;
+		}
+		if (msg->sg.size == 0)
+			tls_free_open_rec(sk);
+		break;
+	case __SK_DROP:
+	default:
+		sk_msg_free_partial(sk, msg, send);
+		if (msg->apply_bytes < send)
+			msg->apply_bytes = 0;
+		else
+			msg->apply_bytes -= send;
+		if (msg->sg.size == 0)
+			tls_free_open_rec(sk);
+		*copied -= send;
+		err = -EACCES;
+	}
 
-	/* Return if we already have an open record */
-	if (ctx->open_rec)
-		return ctx->open_rec;
+	if (likely(!err)) {
+		bool reset_eval = !ctx->open_rec;
 
-	mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
+		rec = ctx->open_rec;
+		if (rec) {
+			msg = &rec->msg_plaintext;
+			if (!msg->apply_bytes)
+				reset_eval = true;
+		}
+		if (reset_eval) {
+			psock->eval = __SK_NONE;
+			if (psock->sk_redir) {
+				sock_put(psock->sk_redir);
+				psock->sk_redir = NULL;
+			}
+		}
+		if (rec)
+			goto more_data;
+	}
+ out_err:
+	sk_psock_put(sk, psock);
+	return err;
+}
+
+static int tls_sw_push_pending_record(struct sock *sk, int flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+	struct tls_rec *rec = ctx->open_rec;
+	struct sk_msg *msg_pl;
+	size_t copied;
 
-	rec = kzalloc(mem_size, sk->sk_allocation);
 	if (!rec)
-		return NULL;
+		return 0;
 
 	msg_pl = &rec->msg_plaintext;
-	msg_en = &rec->msg_encrypted;
-
-	sk_msg_init(msg_pl);
-	sk_msg_init(msg_en);
-
-	sg_init_table(rec->sg_aead_in, 2);
-	sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
-		   sizeof(rec->aad_space));
-	sg_unmark_end(&rec->sg_aead_in[1]);
-
-	sg_init_table(rec->sg_aead_out, 2);
-	sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
-		   sizeof(rec->aad_space));
-	sg_unmark_end(&rec->sg_aead_out[1]);
-
-	ctx->open_rec = rec;
-	rec->inplace_crypto = 1;
+	copied = msg_pl->sg.size;
+	if (!copied)
+		return 0;
 
-	return rec;
+	return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA,
+				   &copied, flags);
 }
 
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
@@ -589,7 +840,10 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			goto send_end;
 		}
 
-		rec = get_rec(sk);
+		if (ctx->open_rec)
+			rec = ctx->open_rec;
+		else
+			rec = ctx->open_rec = tls_get_rec(sk);
 		if (!rec) {
 			ret = -ENOMEM;
 			goto send_end;
@@ -628,6 +882,8 @@ alloc_encrypted:
 		}
 
 		if (!is_kvec && (full_record || eor) && !async_capable) {
+			u32 first = msg_pl->sg.end;
+
 			ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
 							msg_pl, try_to_copy);
 			if (ret)
@@ -637,15 +893,27 @@ alloc_encrypted:
 
 			num_zc++;
 			copied += try_to_copy;
-			ret = tls_push_record(sk, msg->msg_flags, record_type);
+
+			sk_msg_sg_copy_set(msg_pl, first);
+			ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+						  record_type, &copied,
+						  msg->msg_flags);
 			if (ret) {
 				if (ret == -EINPROGRESS)
 					num_async++;
+				else if (ret == -ENOMEM)
+					goto wait_for_memory;
+				else if (ret == -ENOSPC)
+					goto rollback_iter;
 				else if (ret != -EAGAIN)
 					goto send_end;
 			}
 			continue;
-
+rollback_iter:
+			copied -= try_to_copy;
+			sk_msg_sg_copy_clear(msg_pl, first);
+			iov_iter_revert(&msg->msg_iter,
+					msg_pl->sg.size - orig_size);
 fallback_to_reg_send:
 			sk_msg_trim(sk, msg_pl, orig_size);
 		}
@@ -678,12 +946,19 @@ fallback_to_reg_send:
 		tls_ctx->pending_open_record_frags = true;
 		copied += try_to_copy;
 		if (full_record || eor) {
-			ret = tls_push_record(sk, msg->msg_flags, record_type);
+			ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+						  record_type, &copied,
+						  msg->msg_flags);
 			if (ret) {
 				if (ret == -EINPROGRESS)
 					num_async++;
-				else if (ret != -EAGAIN)
+				else if (ret == -ENOMEM)
+					goto wait_for_memory;
+				else if (ret != -EAGAIN) {
+					if (ret == -ENOSPC)
+						ret = 0;
 					goto send_end;
+				}
 			}
 		}
 
@@ -742,10 +1017,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	unsigned char record_type = TLS_RECORD_TYPE_DATA;
-	size_t orig_size = size;
 	struct sk_msg *msg_pl;
 	struct tls_rec *rec;
 	int num_async = 0;
+	size_t copied = 0;
 	bool full_record;
 	int record_room;
 	int ret = 0;
@@ -778,7 +1053,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 			goto sendpage_end;
 		}
 
-		rec = get_rec(sk);
+		if (ctx->open_rec)
+			rec = ctx->open_rec;
+		else
+			rec = ctx->open_rec = tls_get_rec(sk);
 		if (!rec) {
 			ret = -ENOMEM;
 			goto sendpage_end;
@@ -788,6 +1066,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 
 		full_record = false;
 		record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+		copied = 0;
 		copy = size;
 		if (copy >= record_room) {
 			copy = record_room;
@@ -818,16 +1097,23 @@ alloc_payload:
 
 		offset += copy;
 		size -= copy;
+		copied += copy;
 
 		tls_ctx->pending_open_record_frags = true;
 		if (full_record || eor || sk_msg_full(msg_pl)) {
 			rec->inplace_crypto = 0;
-			ret = tls_push_record(sk, flags, record_type);
+			ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+						  record_type, &copied, flags);
 			if (ret) {
 				if (ret == -EINPROGRESS)
 					num_async++;
-				else if (ret != -EAGAIN)
+				else if (ret == -ENOMEM)
+					goto wait_for_memory;
+				else if (ret != -EAGAIN) {
+					if (ret == -ENOSPC)
+						ret = 0;
 					goto sendpage_end;
+				}
 			}
 		}
 		continue;
@@ -851,24 +1137,20 @@ wait_for_memory:
 		}
 	}
 sendpage_end:
-	if (orig_size > size)
-		ret = orig_size - size;
-	else
-		ret = sk_stream_error(sk, flags, ret);
-
+	ret = sk_stream_error(sk, flags, ret);
 	release_sock(sk);
-	return ret;
+	return copied ? copied : ret;
 }
 
-static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
-				     long timeo, int *err)
+static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
+				     int flags, long timeo, int *err)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct sk_buff *skb;
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
-	while (!(skb = ctx->recv_pkt)) {
+	while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) {
 		if (sk->sk_err) {
 			*err = sock_error(sk);
 			return NULL;
@@ -887,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
 
 		add_wait_queue(sk_sleep(sk), &wait);
 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-		sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait);
+		sk_wait_event(sk, &timeo,
+			      ctx->recv_pkt != skb ||
+			      !sk_psock_queue_empty(psock),
+			      &wait);
 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 		remove_wait_queue(sk_sleep(sk), &wait);
 
@@ -1164,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk,
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	struct sk_psock *psock;
 	unsigned char control;
 	struct strp_msg *rxm;
 	struct sk_buff *skb;
@@ -1179,6 +1465,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
 
+	psock = sk_psock_get(sk);
 	lock_sock(sk);
 
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
@@ -1188,9 +1475,19 @@ int tls_sw_recvmsg(struct sock *sk,
 		bool async = false;
 		int chunk = 0;
 
-		skb = tls_wait_data(sk, flags, timeo, &err);
-		if (!skb)
+		skb = tls_wait_data(sk, psock, flags, timeo, &err);
+		if (!skb) {
+			if (psock) {
+				int ret = __tcp_bpf_recvmsg(sk, psock, msg, len);
+
+				if (ret > 0) {
+					copied += ret;
+					len -= ret;
+					continue;
+				}
+			}
 			goto recv_end;
+		}
 
 		rxm = strp_msg(skb);
 
@@ -1296,6 +1593,8 @@ recv_end:
 	}
 
 	release_sock(sk);
+	if (psock)
+		sk_psock_put(sk, psock);
 	return copied ? : err;
 }
 
@@ -1318,7 +1617,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
-	skb = tls_wait_data(sk, flags, timeo, &err);
+	skb = tls_wait_data(sk, NULL, flags, timeo, &err);
 	if (!skb)
 		goto splice_read_end;
 
@@ -1356,11 +1655,16 @@ bool tls_sw_stream_read(const struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	bool ingress_empty = true;
+	struct sk_psock *psock;
 
-	if (ctx->recv_pkt)
-		return true;
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock)
+		ingress_empty = list_empty(&psock->ingress_msg);
+	rcu_read_unlock();
 
-	return false;
+	return !ingress_empty || ctx->recv_pkt;
 }
 
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1439,8 +1743,15 @@ static void tls_data_ready(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	struct sk_psock *psock;
 
 	strp_data_ready(&ctx->strp);
+
+	psock = sk_psock_get(sk);
+	if (psock && !list_empty(&psock->ingress_msg)) {
+		ctx->saved_data_ready(sk);
+		sk_psock_put(sk, psock);
+	}
 }
 
 void tls_sw_free_resources_tx(struct sock *sk)
-- 
cgit v1.2.3


From 8a615c6b0352a9ec56151b6c95d68e0a2eef5cf0 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Mon, 15 Oct 2018 10:27:45 -0700
Subject: bpf: Allow sk_lookup with IPv6 module

This is a more complete fix than d71019b54bff ("net: core: Fix build
with CONFIG_IPV6=m"), so that IPv6 sockets may be looked up if the IPv6
module is loaded (not just if it's compiled in).

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/addrconf.h |  5 +++++
 net/core/filter.c      | 12 +++++++-----
 net/ipv6/af_inet6.c    |  1 +
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..14b789a123e7 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -265,6 +265,11 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly;
 struct ipv6_bpf_stub {
 	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 			  bool force_bind_address_no_port, bool with_lock);
+	struct sock *(*udp6_lib_lookup)(struct net *net,
+					const struct in6_addr *saddr, __be16 sport,
+					const struct in6_addr *daddr, __be16 dport,
+					int dif, int sdif, struct udp_table *tbl,
+					struct sk_buff *skb);
 };
 extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0f5260b04bfe..f76ba31a8cc7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4717,7 +4717,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
 					       dst4, tuple->ipv4.dport,
 					       dif, sdif, &udp_table, skb);
-#if IS_REACHABLE(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 	} else {
 		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
 		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
@@ -4728,10 +4728,12 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 					    src6, tuple->ipv6.sport,
 					    dst6, tuple->ipv6.dport,
 					    dif, sdif, &refcounted);
-		else
-			sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
-					       dst6, tuple->ipv6.dport,
-					       dif, sdif, &udp_table, skb);
+		else if (likely(ipv6_bpf_stub))
+			sk = ipv6_bpf_stub->udp6_lib_lookup(net,
+							    src6, tuple->ipv6.sport,
+							    dst6, tuple->ipv6.dport,
+							    dif, sdif,
+							    &udp_table, skb);
 #endif
 	}
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e9c8cfdf4b4c..3f4d61017a69 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -901,6 +901,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 
 static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
 	.inet6_bind = __inet6_bind,
+	.udp6_lib_lookup = __udp6_lib_lookup,
 };
 
 static int __init inet6_init(void)
-- 
cgit v1.2.3


From 61414f5ec9834df8aa4f55c90de16b71a3d6ca8d Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 9 Oct 2018 23:57:43 +0100
Subject: FDDI: defza: Add support for DEC FDDIcontroller 700 TURBOchannel
 adapter

Add support for the DEC FDDIcontroller 700 (DEFZA), Digital Equipment
Corporation's first-generation FDDI network interface adapter, made for
TURBOchannel and based on a discrete version of what eventually became
Motorola's widely used CAMEL chipset.

The CAMEL chipset is present for example in the DEC FDDIcontroller
TURBOchannel, EISA and PCI adapters (DEFTA/DEFEA/DEFPA) that we support
with the `defxx' driver, however the host bus interface logic and the
firmware API are different in the DEFZA and hence a separate driver is
required.

There isn't much to say about the driver except that it works, but there
is one peculiarity to mention.  The adapter implements two Tx/Rx queue
pairs.

Of these one pair is the usual network Tx/Rx queue pair, in this case
used by the adapter to exchange frames with the ring, via the RMC (Ring
Memory Controller) chip.  The Tx queue is handled directly by the RMC
chip and resides in onboard packet memory.  The Rx queue is maintained
via DMA in host memory by adapter's firmware copying received data
stored by the RMC in onboard packet memory.

The other pair is used to communicate SMT frames with adapter's
firmware.  Any SMT frame received from the RMC via the Rx queue must be
queued back by the driver to the SMT Rx queue for the firmware to
process.  Similarly the firmware uses the SMT Tx queue to supply the
driver with SMT frames that must be queued back to the Tx queue for the
RMC to send to the ring.

This solution was chosen because the designers ran out of PCB space and
could not squeeze in more logic onto the board that would be required to
handle this SMT frame traffic without the need to involve the driver, as
with the later DEFTA/DEFEA/DEFPA adapters.

Finally the driver does some Frame Control byte decoding, so to avoid
magic numbers some macros are added to <linux/if_fddi.h>.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/00-INDEX  |    2 +
 Documentation/networking/defza.txt |   57 ++
 MAINTAINERS                        |    5 +
 drivers/net/fddi/Kconfig           |   11 +
 drivers/net/fddi/Makefile          |    1 +
 drivers/net/fddi/defza.c           | 1535 ++++++++++++++++++++++++++++++++++++
 drivers/net/fddi/defza.h           |  791 +++++++++++++++++++
 include/uapi/linux/if_fddi.h       |   21 +-
 8 files changed, 2420 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/defza.txt
 create mode 100644 drivers/net/fddi/defza.c
 create mode 100644 drivers/net/fddi/defza.h

(limited to 'include')

diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index f4f2b5d6c8d8..2d239770b95f 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -56,6 +56,8 @@ de4x5.txt
 	- the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver
 decnet.txt
 	- info on using the DECnet networking layer in Linux.
+defza.txt
+	- the DEC FDDIcontroller 700 (DEFZA-xx) TURBOchannel FDDI driver
 dl2k.txt
 	- README for D-Link DL2000-based Gigabit Ethernet Adapters (dl2k.ko).
 dm9000.txt
diff --git a/Documentation/networking/defza.txt b/Documentation/networking/defza.txt
new file mode 100644
index 000000000000..663e4a906751
--- /dev/null
+++ b/Documentation/networking/defza.txt
@@ -0,0 +1,57 @@
+Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver v.1.1.4.
+
+
+DEC FDDIcontroller 700 is DEC's first-generation TURBOchannel FDDI
+network card, designed in 1990 specifically for the DECstation 5000
+model 200 workstation.  The board is a single attachment station and
+it was manufactured in two variations, both of which are supported.
+
+First is the SAS MMF DEFZA-AA option, the original design implementing
+the standard MMF-PMD, however with a pair of ST connectors rather than
+the usual MIC connector.  The other one is the SAS ThinWire/STP DEFZA-CA
+option, denoted 700-C, with the network medium selectable by a switch
+between the DEC proprietary ThinWire-PMD using a BNC connector and the
+standard STP-PMD using a DE-9F connector.  This option can interface to
+a DECconcentrator 500 device and, in the case of the STP-PMD, also other
+FDDI equipment and was designed to make it easier to transition from
+existing IEEE 802.3 10BASE2 Ethernet and IEEE 802.5 Token Ring networks
+by providing means to reuse existing cabling.
+
+This driver handles any number of cards installed in a single system.
+They get fddi0, fddi1, etc. interface names assigned in the order of
+increasing TURBOchannel slot numbers.
+
+The board only supports DMA on the receive side.  Transmission involves
+the use of PIO.  As a result under a heavy transmission load there will
+be a significant impact on system performance.
+
+The board supports a 64-entry CAM for matching destination addresses.
+Two entries are preoccupied by the Directed Beacon and Ring Purger
+multicast addresses and the rest is used as a multicast filter.  An
+all-multi mode is also supported for LLC frames and it is used if
+requested explicitly or if the CAM overflows.  The promiscuous mode
+supports separate enables for LLC and SMT frames, but this driver
+doesn't support changing them individually.
+
+
+Known problems:
+
+None.
+
+
+To do:
+
+5. MAC address change.  The card does not support changing the Media
+   Access Controller's address registers but a similar effect can be
+   achieved by adding an alias to the CAM.  There is no way to disable
+   matching against the original address though.
+
+7. Queueing incoming/outgoing SMT frames in the driver if the SMT
+   receive/RMC transmit ring is full. (?)
+
+8. Retrieving/reporting FDDI/SNMP stats.
+
+
+Both success and failure reports are welcome.
+
+Maciej W. Rozycki  <macro@linux-mips.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index 6d5161def3f3..031127139f3b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4170,6 +4170,11 @@ S:	Maintained
 F:	drivers/platform/x86/dell-smbios-wmi.c
 F:	tools/wmi/dell-smbios-example.c
 
+DEFZA FDDI NETWORK DRIVER
+M:	"Maciej W. Rozycki" <macro@linux-mips.org>
+S:	Maintained
+F:	drivers/net/fddi/defza.*
+
 DELL LAPTOP DRIVER
 M:	Matthew Garrett <mjg59@srcf.ucam.org>
 M:	Pali Rohár <pali.rohar@gmail.com>
diff --git a/drivers/net/fddi/Kconfig b/drivers/net/fddi/Kconfig
index 3a424c864f4d..d62e8c6205f7 100644
--- a/drivers/net/fddi/Kconfig
+++ b/drivers/net/fddi/Kconfig
@@ -15,6 +15,17 @@ config FDDI
 
 if FDDI
 
+config DEFZA
+	tristate "DEC FDDIcontroller 700/700-C (DEFZA-xx) support"
+	depends on FDDI && TC
+	help
+	  This is support for the DEC FDDIcontroller 700 (DEFZA-AA, fiber)
+	  and 700-C (DEFZA-CA, copper) TURBOchannel network cards which
+	  can connect you to a local FDDI network.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called defza.  If unsure, say N.
+
 config DEFXX
 	tristate "Digital DEFTA/DEFEA/DEFPA adapter support"
 	depends on FDDI && (PCI || EISA || TC)
diff --git a/drivers/net/fddi/Makefile b/drivers/net/fddi/Makefile
index 36da19c9a8aa..194b52cc20b0 100644
--- a/drivers/net/fddi/Makefile
+++ b/drivers/net/fddi/Makefile
@@ -3,4 +3,5 @@
 #
 
 obj-$(CONFIG_DEFXX) += defxx.o
+obj-$(CONFIG_DEFZA) += defza.o
 obj-$(CONFIG_SKFP) += skfp/
diff --git a/drivers/net/fddi/defza.c b/drivers/net/fddi/defza.c
new file mode 100644
index 000000000000..7d01b70f7ed8
--- /dev/null
+++ b/drivers/net/fddi/defza.c
@@ -0,0 +1,1535 @@
+// SPDX-License-Identifier: GPL-2.0
+/*	FDDI network adapter driver for DEC FDDIcontroller 700/700-C devices.
+ *
+ *	Copyright (c) 2018  Maciej W. Rozycki
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	References:
+ *
+ *	Dave Sawyer & Phil Weeks & Frank Itkowsky,
+ *	"DEC FDDIcontroller 700 Port Specification",
+ *	Revision 1.1, Digital Equipment Corporation
+ */
+
+/* ------------------------------------------------------------------------- */
+/* FZA configurable parameters.                                              */
+
+/* The number of transmit ring descriptors; either 0 for 512 or 1 for 1024.  */
+#define FZA_RING_TX_MODE 0
+
+/* The number of receive ring descriptors; from 2 up to 256.  */
+#define FZA_RING_RX_SIZE 256
+
+/* End of FZA configurable parameters.  No need to change anything below.    */
+/* ------------------------------------------------------------------------- */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/tc.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include <asm/barrier.h>
+
+#include "defza.h"
+
+#define DRV_NAME "defza"
+#define DRV_VERSION "v.1.1.4"
+#define DRV_RELDATE "Oct  6 2018"
+
+static char version[] =
+	DRV_NAME ": " DRV_VERSION "  " DRV_RELDATE "  Maciej W. Rozycki\n";
+
+MODULE_AUTHOR("Maciej W. Rozycki <macro@linux-mips.org>");
+MODULE_DESCRIPTION("DEC FDDIcontroller 700 (DEFZA-xx) driver");
+MODULE_LICENSE("GPL");
+
+static int loopback;
+module_param(loopback, int, 0644);
+
+/* Ring Purger Multicast */
+static u8 hw_addr_purger[8] = { 0x09, 0x00, 0x2b, 0x02, 0x01, 0x05 };
+/* Directed Beacon Multicast */
+static u8 hw_addr_beacon[8] = { 0x01, 0x80, 0xc2, 0x00, 0x01, 0x00 };
+
+/* Shorthands for MMIO accesses that we require to be strongly ordered
+ * WRT preceding MMIO accesses.
+ */
+#define readw_o readw_relaxed
+#define readl_o readl_relaxed
+
+#define writew_o writew_relaxed
+#define writel_o writel_relaxed
+
+/* Shorthands for MMIO accesses that we are happy with being weakly ordered
+ * WRT preceding MMIO accesses.
+ */
+#define readw_u readw_relaxed
+#define readl_u readl_relaxed
+#define readq_u readq_relaxed
+
+#define writew_u writew_relaxed
+#define writel_u writel_relaxed
+#define writeq_u writeq_relaxed
+
+static inline struct sk_buff *fza_alloc_skb_irq(struct net_device *dev,
+						unsigned int length)
+{
+	return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
+}
+
+static inline struct sk_buff *fza_alloc_skb(struct net_device *dev,
+					    unsigned int length)
+{
+	return __netdev_alloc_skb(dev, length, GFP_KERNEL);
+}
+
+static inline void fza_skb_align(struct sk_buff *skb, unsigned int v)
+{
+	unsigned long x, y;
+
+	x = (unsigned long)skb->data;
+	y = ALIGN(x, v);
+
+	skb_reserve(skb, y - x);
+}
+
+static inline void fza_reads(const void __iomem *from, void *to,
+			     unsigned long size)
+{
+	if (sizeof(unsigned long) == 8) {
+		const u64 __iomem *src = from;
+		const u32 __iomem *src_trail;
+		u64 *dst = to;
+		u32 *dst_trail;
+
+		for (size = (size + 3) / 4; size > 1; size -= 2)
+			*dst++ = readq_u(src++);
+		if (size) {
+			src_trail = (u32 __iomem *)src;
+			dst_trail = (u32 *)dst;
+			*dst_trail = readl_u(src_trail);
+		}
+	} else {
+		const u32 __iomem *src = from;
+		u32 *dst = to;
+
+		for (size = (size + 3) / 4; size; size--)
+			*dst++ = readl_u(src++);
+	}
+}
+
+static inline void fza_writes(const void *from, void __iomem *to,
+			      unsigned long size)
+{
+	if (sizeof(unsigned long) == 8) {
+		const u64 *src = from;
+		const u32 *src_trail;
+		u64 __iomem *dst = to;
+		u32 __iomem *dst_trail;
+
+		for (size = (size + 3) / 4; size > 1; size -= 2)
+			writeq_u(*src++, dst++);
+		if (size) {
+			src_trail = (u32 *)src;
+			dst_trail = (u32 __iomem *)dst;
+			writel_u(*src_trail, dst_trail);
+		}
+	} else {
+		const u32 *src = from;
+		u32 __iomem *dst = to;
+
+		for (size = (size + 3) / 4; size; size--)
+			writel_u(*src++, dst++);
+	}
+}
+
+static inline void fza_moves(const void __iomem *from, void __iomem *to,
+			     unsigned long size)
+{
+	if (sizeof(unsigned long) == 8) {
+		const u64 __iomem *src = from;
+		const u32 __iomem *src_trail;
+		u64 __iomem *dst = to;
+		u32 __iomem *dst_trail;
+
+		for (size = (size + 3) / 4; size > 1; size -= 2)
+			writeq_u(readq_u(src++), dst++);
+		if (size) {
+			src_trail = (u32 __iomem *)src;
+			dst_trail = (u32 __iomem *)dst;
+			writel_u(readl_u(src_trail), dst_trail);
+		}
+	} else {
+		const u32 __iomem *src = from;
+		u32 __iomem *dst = to;
+
+		for (size = (size + 3) / 4; size; size--)
+			writel_u(readl_u(src++), dst++);
+	}
+}
+
+static inline void fza_zeros(void __iomem *to, unsigned long size)
+{
+	if (sizeof(unsigned long) == 8) {
+		u64 __iomem *dst = to;
+		u32 __iomem *dst_trail;
+
+		for (size = (size + 3) / 4; size > 1; size -= 2)
+			writeq_u(0, dst++);
+		if (size) {
+			dst_trail = (u32 __iomem *)dst;
+			writel_u(0, dst_trail);
+		}
+	} else {
+		u32 __iomem *dst = to;
+
+		for (size = (size + 3) / 4; size; size--)
+			writel_u(0, dst++);
+	}
+}
+
+static inline void fza_regs_dump(struct fza_private *fp)
+{
+	pr_debug("%s: iomem registers:\n", fp->name);
+	pr_debug(" reset:           0x%04x\n", readw_o(&fp->regs->reset));
+	pr_debug(" interrupt event: 0x%04x\n", readw_u(&fp->regs->int_event));
+	pr_debug(" status:          0x%04x\n", readw_u(&fp->regs->status));
+	pr_debug(" interrupt mask:  0x%04x\n", readw_u(&fp->regs->int_mask));
+	pr_debug(" control A:       0x%04x\n", readw_u(&fp->regs->control_a));
+	pr_debug(" control B:       0x%04x\n", readw_u(&fp->regs->control_b));
+}
+
+static inline void fza_do_reset(struct fza_private *fp)
+{
+	/* Reset the board. */
+	writew_o(FZA_RESET_INIT, &fp->regs->reset);
+	readw_o(&fp->regs->reset);	/* Synchronize. */
+	readw_o(&fp->regs->reset);	/* Read it back for a small delay. */
+	writew_o(FZA_RESET_CLR, &fp->regs->reset);
+
+	/* Enable all interrupt events we handle. */
+	writew_o(fp->int_mask, &fp->regs->int_mask);
+	readw_o(&fp->regs->int_mask);	/* Synchronize. */
+}
+
+static inline void fza_do_shutdown(struct fza_private *fp)
+{
+	/* Disable the driver mode. */
+	writew_o(FZA_CONTROL_B_IDLE, &fp->regs->control_b);
+
+	/* And reset the board. */
+	writew_o(FZA_RESET_INIT, &fp->regs->reset);
+	readw_o(&fp->regs->reset);	/* Synchronize. */
+	writew_o(FZA_RESET_CLR, &fp->regs->reset);
+	readw_o(&fp->regs->reset);	/* Synchronize. */
+}
+
+static int fza_reset(struct fza_private *fp)
+{
+	unsigned long flags;
+	uint status, state;
+	long t;
+
+	pr_info("%s: resetting the board...\n", fp->name);
+
+	spin_lock_irqsave(&fp->lock, flags);
+	fp->state_chg_flag = 0;
+	fza_do_reset(fp);
+	spin_unlock_irqrestore(&fp->lock, flags);
+
+	/* DEC says RESET needs up to 30 seconds to complete.  My DEFZA-AA
+	 * rev. C03 happily finishes in 9.7 seconds. :-)  But we need to
+	 * be on the safe side...
+	 */
+	t = wait_event_timeout(fp->state_chg_wait, fp->state_chg_flag,
+			       45 * HZ);
+	status = readw_u(&fp->regs->status);
+	state = FZA_STATUS_GET_STATE(status);
+	if (fp->state_chg_flag == 0) {
+		pr_err("%s: RESET timed out!, state %x\n", fp->name, state);
+		return -EIO;
+	}
+	if (state != FZA_STATE_UNINITIALIZED) {
+		pr_err("%s: RESET failed!, state %x, failure ID %x\n",
+		       fp->name, state, FZA_STATUS_GET_TEST(status));
+		return -EIO;
+	}
+	pr_info("%s: OK\n", fp->name);
+	pr_debug("%s: RESET: %lums elapsed\n", fp->name,
+		 (45 * HZ - t) * 1000 / HZ);
+
+	return 0;
+}
+
+static struct fza_ring_cmd __iomem *fza_cmd_send(struct net_device *dev,
+						 int command)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct fza_ring_cmd __iomem *ring = fp->ring_cmd + fp->ring_cmd_index;
+	unsigned int old_mask, new_mask;
+	union fza_cmd_buf __iomem *buf;
+	struct netdev_hw_addr *ha;
+	int i;
+
+	old_mask = fp->int_mask;
+	new_mask = old_mask & ~FZA_MASK_STATE_CHG;
+	writew_u(new_mask, &fp->regs->int_mask);
+	readw_o(&fp->regs->int_mask);			/* Synchronize. */
+	fp->int_mask = new_mask;
+
+	buf = fp->mmio + readl_u(&ring->buffer);
+
+	if ((readl_u(&ring->cmd_own) & FZA_RING_OWN_MASK) !=
+	    FZA_RING_OWN_HOST) {
+		pr_warn("%s: command buffer full, command: %u!\n", fp->name,
+			command);
+		return NULL;
+	}
+
+	switch (command) {
+	case FZA_RING_CMD_INIT:
+		writel_u(FZA_RING_TX_MODE, &buf->init.tx_mode);
+		writel_u(FZA_RING_RX_SIZE, &buf->init.hst_rx_size);
+		fza_zeros(&buf->init.counters, sizeof(buf->init.counters));
+		break;
+
+	case FZA_RING_CMD_MODCAM:
+		i = 0;
+		fza_writes(&hw_addr_purger, &buf->cam.hw_addr[i++],
+			   sizeof(*buf->cam.hw_addr));
+		fza_writes(&hw_addr_beacon, &buf->cam.hw_addr[i++],
+			   sizeof(*buf->cam.hw_addr));
+		netdev_for_each_mc_addr(ha, dev) {
+			if (i >= FZA_CMD_CAM_SIZE)
+				break;
+			fza_writes(ha->addr, &buf->cam.hw_addr[i++],
+				   sizeof(*buf->cam.hw_addr));
+		}
+		while (i < FZA_CMD_CAM_SIZE)
+			fza_zeros(&buf->cam.hw_addr[i++],
+				  sizeof(*buf->cam.hw_addr));
+		break;
+
+	case FZA_RING_CMD_PARAM:
+		writel_u(loopback, &buf->param.loop_mode);
+		writel_u(fp->t_max, &buf->param.t_max);
+		writel_u(fp->t_req, &buf->param.t_req);
+		writel_u(fp->tvx, &buf->param.tvx);
+		writel_u(fp->lem_threshold, &buf->param.lem_threshold);
+		fza_writes(&fp->station_id, &buf->param.station_id,
+			   sizeof(buf->param.station_id));
+		/* Convert to milliseconds due to buggy firmware. */
+		writel_u(fp->rtoken_timeout / 12500,
+			 &buf->param.rtoken_timeout);
+		writel_u(fp->ring_purger, &buf->param.ring_purger);
+		break;
+
+	case FZA_RING_CMD_MODPROM:
+		if (dev->flags & IFF_PROMISC) {
+			writel_u(1, &buf->modprom.llc_prom);
+			writel_u(1, &buf->modprom.smt_prom);
+		} else {
+			writel_u(0, &buf->modprom.llc_prom);
+			writel_u(0, &buf->modprom.smt_prom);
+		}
+		if (dev->flags & IFF_ALLMULTI ||
+		    netdev_mc_count(dev) > FZA_CMD_CAM_SIZE - 2)
+			writel_u(1, &buf->modprom.llc_multi);
+		else
+			writel_u(0, &buf->modprom.llc_multi);
+		writel_u(1, &buf->modprom.llc_bcast);
+		break;
+	}
+
+	/* Trigger the command. */
+	writel_u(FZA_RING_OWN_FZA | command, &ring->cmd_own);
+	writew_o(FZA_CONTROL_A_CMD_POLL, &fp->regs->control_a);
+
+	fp->ring_cmd_index = (fp->ring_cmd_index + 1) % FZA_RING_CMD_SIZE;
+
+	fp->int_mask = old_mask;
+	writew_u(fp->int_mask, &fp->regs->int_mask);
+
+	return ring;
+}
+
+static int fza_init_send(struct net_device *dev,
+			 struct fza_cmd_init *__iomem *init)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct fza_ring_cmd __iomem *ring;
+	unsigned long flags;
+	u32 stat;
+	long t;
+
+	spin_lock_irqsave(&fp->lock, flags);
+	fp->cmd_done_flag = 0;
+	ring = fza_cmd_send(dev, FZA_RING_CMD_INIT);
+	spin_unlock_irqrestore(&fp->lock, flags);
+	if (!ring)
+		/* This should never happen in the uninitialized state,
+		 * so do not try to recover and just consider it fatal.
+		 */
+		return -ENOBUFS;
+
+	/* INIT may take quite a long time (160ms for my C03). */
+	t = wait_event_timeout(fp->cmd_done_wait, fp->cmd_done_flag, 3 * HZ);
+	if (fp->cmd_done_flag == 0) {
+		pr_err("%s: INIT command timed out!, state %x\n", fp->name,
+		       FZA_STATUS_GET_STATE(readw_u(&fp->regs->status)));
+		return -EIO;
+	}
+	stat = readl_u(&ring->stat);
+	if (stat != FZA_RING_STAT_SUCCESS) {
+		pr_err("%s: INIT command failed!, status %02x, state %x\n",
+		       fp->name, stat,
+		       FZA_STATUS_GET_STATE(readw_u(&fp->regs->status)));
+		return -EIO;
+	}
+	pr_debug("%s: INIT: %lums elapsed\n", fp->name,
+		 (3 * HZ - t) * 1000 / HZ);
+
+	if (init)
+		*init = fp->mmio + readl_u(&ring->buffer);
+	return 0;
+}
+
+static void fza_rx_init(struct fza_private *fp)
+{
+	int i;
+
+	/* Fill the host receive descriptor ring. */
+	for (i = 0; i < FZA_RING_RX_SIZE; i++) {
+		writel_o(0, &fp->ring_hst_rx[i].rmc);
+		writel_o((fp->rx_dma[i] + 0x1000) >> 9,
+			 &fp->ring_hst_rx[i].buffer1);
+		writel_o(fp->rx_dma[i] >> 9 | FZA_RING_OWN_FZA,
+			 &fp->ring_hst_rx[i].buf0_own);
+	}
+}
+
+static void fza_set_rx_mode(struct net_device *dev)
+{
+	fza_cmd_send(dev, FZA_RING_CMD_MODCAM);
+	fza_cmd_send(dev, FZA_RING_CMD_MODPROM);
+}
+
+union fza_buffer_txp {
+	struct fza_buffer_tx *data_ptr;
+	struct fza_buffer_tx __iomem *mmio_ptr;
+};
+
+static int fza_do_xmit(union fza_buffer_txp ub, int len,
+		       struct net_device *dev, int smt)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct fza_buffer_tx __iomem *rmc_tx_ptr;
+	int i, first, frag_len, left_len;
+	u32 own, rmc;
+
+	if (((((fp->ring_rmc_txd_index - 1 + fp->ring_rmc_tx_size) -
+	       fp->ring_rmc_tx_index) % fp->ring_rmc_tx_size) *
+	     FZA_TX_BUFFER_SIZE) < len)
+		return 1;
+
+	first = fp->ring_rmc_tx_index;
+
+	left_len = len;
+	frag_len = FZA_TX_BUFFER_SIZE;
+	/* First descriptor is relinquished last. */
+	own = FZA_RING_TX_OWN_HOST;
+	/* First descriptor carries frame length; we don't use cut-through. */
+	rmc = FZA_RING_TX_SOP | FZA_RING_TX_VBC | len;
+	do {
+		i = fp->ring_rmc_tx_index;
+		rmc_tx_ptr = &fp->buffer_tx[i];
+
+		if (left_len < FZA_TX_BUFFER_SIZE)
+			frag_len = left_len;
+		left_len -= frag_len;
+
+		/* Length must be a multiple of 4 as only word writes are
+		 * permitted!
+		 */
+		frag_len = (frag_len + 3) & ~3;
+		if (smt)
+			fza_moves(ub.mmio_ptr, rmc_tx_ptr, frag_len);
+		else
+			fza_writes(ub.data_ptr, rmc_tx_ptr, frag_len);
+
+		if (left_len == 0)
+			rmc |= FZA_RING_TX_EOP;		/* Mark last frag. */
+
+		writel_o(rmc, &fp->ring_rmc_tx[i].rmc);
+		writel_o(own, &fp->ring_rmc_tx[i].own);
+
+		ub.data_ptr++;
+		fp->ring_rmc_tx_index = (fp->ring_rmc_tx_index + 1) %
+					fp->ring_rmc_tx_size;
+
+		/* Settings for intermediate frags. */
+		own = FZA_RING_TX_OWN_RMC;
+		rmc = 0;
+	} while (left_len > 0);
+
+	if (((((fp->ring_rmc_txd_index - 1 + fp->ring_rmc_tx_size) -
+	       fp->ring_rmc_tx_index) % fp->ring_rmc_tx_size) *
+	     FZA_TX_BUFFER_SIZE) < dev->mtu + dev->hard_header_len) {
+		netif_stop_queue(dev);
+		pr_debug("%s: queue stopped\n", fp->name);
+	}
+
+	writel_o(FZA_RING_TX_OWN_RMC, &fp->ring_rmc_tx[first].own);
+
+	/* Go, go, go! */
+	writew_o(FZA_CONTROL_A_TX_POLL, &fp->regs->control_a);
+
+	return 0;
+}
+
+static int fza_do_recv_smt(struct fza_buffer_tx *data_ptr, int len,
+			   u32 rmc, struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct fza_buffer_tx __iomem *smt_rx_ptr;
+	u32 own;
+	int i;
+
+	i = fp->ring_smt_rx_index;
+	own = readl_o(&fp->ring_smt_rx[i].own);
+	if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA)
+		return 1;
+
+	smt_rx_ptr = fp->mmio + readl_u(&fp->ring_smt_rx[i].buffer);
+
+	/* Length must be a multiple of 4 as only word writes are permitted! */
+	fza_writes(data_ptr, smt_rx_ptr, (len + 3) & ~3);
+
+	writel_o(rmc, &fp->ring_smt_rx[i].rmc);
+	writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_rx[i].own);
+
+	fp->ring_smt_rx_index =
+		(fp->ring_smt_rx_index + 1) % fp->ring_smt_rx_size;
+
+	/* Grab it! */
+	writew_o(FZA_CONTROL_A_SMT_RX_POLL, &fp->regs->control_a);
+
+	return 0;
+}
+
+static void fza_tx(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	u32 own, rmc;
+	int i;
+
+	while (1) {
+		i = fp->ring_rmc_txd_index;
+		if (i == fp->ring_rmc_tx_index)
+			break;
+		own = readl_o(&fp->ring_rmc_tx[i].own);
+		if ((own & FZA_RING_OWN_MASK) == FZA_RING_TX_OWN_RMC)
+			break;
+
+		rmc = readl_u(&fp->ring_rmc_tx[i].rmc);
+		/* Only process the first descriptor. */
+		if ((rmc & FZA_RING_TX_SOP) != 0) {
+			if ((rmc & FZA_RING_TX_DCC_MASK) ==
+			    FZA_RING_TX_DCC_SUCCESS) {
+				int pkt_len = (rmc & FZA_RING_PBC_MASK) - 3;
+								/* Omit PRH. */
+
+				fp->stats.tx_packets++;
+				fp->stats.tx_bytes += pkt_len;
+			} else {
+				fp->stats.tx_errors++;
+				switch (rmc & FZA_RING_TX_DCC_MASK) {
+				case FZA_RING_TX_DCC_DTP_SOP:
+				case FZA_RING_TX_DCC_DTP:
+				case FZA_RING_TX_DCC_ABORT:
+					fp->stats.tx_aborted_errors++;
+					break;
+				case FZA_RING_TX_DCC_UNDRRUN:
+					fp->stats.tx_fifo_errors++;
+					break;
+				case FZA_RING_TX_DCC_PARITY:
+				default:
+					break;
+				}
+			}
+		}
+
+		fp->ring_rmc_txd_index = (fp->ring_rmc_txd_index + 1) %
+					 fp->ring_rmc_tx_size;
+	}
+
+	if (((((fp->ring_rmc_txd_index - 1 + fp->ring_rmc_tx_size) -
+	       fp->ring_rmc_tx_index) % fp->ring_rmc_tx_size) *
+	     FZA_TX_BUFFER_SIZE) >= dev->mtu + dev->hard_header_len) {
+		if (fp->queue_active) {
+			netif_wake_queue(dev);
+			pr_debug("%s: queue woken\n", fp->name);
+		}
+	}
+}
+
+static inline int fza_rx_err(struct fza_private *fp,
+			     const u32 rmc, const u8 fc)
+{
+	int len, min_len, max_len;
+
+	len = rmc & FZA_RING_PBC_MASK;
+
+	if (unlikely((rmc & FZA_RING_RX_BAD) != 0)) {
+		fp->stats.rx_errors++;
+
+		/* Check special status codes. */
+		if ((rmc & (FZA_RING_RX_CRC | FZA_RING_RX_RRR_MASK |
+			    FZA_RING_RX_DA_MASK | FZA_RING_RX_SA_MASK)) ==
+		     (FZA_RING_RX_CRC | FZA_RING_RX_RRR_DADDR |
+		      FZA_RING_RX_DA_CAM | FZA_RING_RX_SA_ALIAS)) {
+			if (len >= 8190)
+				fp->stats.rx_length_errors++;
+			return 1;
+		}
+		if ((rmc & (FZA_RING_RX_CRC | FZA_RING_RX_RRR_MASK |
+			    FZA_RING_RX_DA_MASK | FZA_RING_RX_SA_MASK)) ==
+		     (FZA_RING_RX_CRC | FZA_RING_RX_RRR_DADDR |
+		      FZA_RING_RX_DA_CAM | FZA_RING_RX_SA_CAM)) {
+			/* Halt the interface to trigger a reset. */
+			writew_o(FZA_CONTROL_A_HALT, &fp->regs->control_a);
+			readw_o(&fp->regs->control_a);	/* Synchronize. */
+			return 1;
+		}
+
+		/* Check the MAC status. */
+		switch (rmc & FZA_RING_RX_RRR_MASK) {
+		case FZA_RING_RX_RRR_OK:
+			if ((rmc & FZA_RING_RX_CRC) != 0)
+				fp->stats.rx_crc_errors++;
+			else if ((rmc & FZA_RING_RX_FSC_MASK) == 0 ||
+				 (rmc & FZA_RING_RX_FSB_ERR) != 0)
+				fp->stats.rx_frame_errors++;
+			return 1;
+		case FZA_RING_RX_RRR_SADDR:
+		case FZA_RING_RX_RRR_DADDR:
+		case FZA_RING_RX_RRR_ABORT:
+			/* Halt the interface to trigger a reset. */
+			writew_o(FZA_CONTROL_A_HALT, &fp->regs->control_a);
+			readw_o(&fp->regs->control_a);	/* Synchronize. */
+			return 1;
+		case FZA_RING_RX_RRR_LENGTH:
+			fp->stats.rx_frame_errors++;
+			return 1;
+		default:
+			return 1;
+		}
+	}
+
+	/* Packet received successfully; validate the length. */
+	switch (fc & FDDI_FC_K_FORMAT_MASK) {
+	case FDDI_FC_K_FORMAT_MANAGEMENT:
+		if ((fc & FDDI_FC_K_CLASS_MASK) == FDDI_FC_K_CLASS_ASYNC)
+			min_len = 37;
+		else
+			min_len = 17;
+		break;
+	case FDDI_FC_K_FORMAT_LLC:
+		min_len = 20;
+		break;
+	default:
+		min_len = 17;
+		break;
+	}
+	max_len = 4495;
+	if (len < min_len || len > max_len) {
+		fp->stats.rx_errors++;
+		fp->stats.rx_length_errors++;
+		return 1;
+	}
+
+	return 0;
+}
+
+static void fza_rx(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct sk_buff *skb, *newskb;
+	struct fza_fddihdr *frame;
+	dma_addr_t dma, newdma;
+	u32 own, rmc, buf;
+	int i, len;
+	u8 fc;
+
+	while (1) {
+		i = fp->ring_hst_rx_index;
+		own = readl_o(&fp->ring_hst_rx[i].buf0_own);
+		if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA)
+			break;
+
+		rmc = readl_u(&fp->ring_hst_rx[i].rmc);
+		skb = fp->rx_skbuff[i];
+		dma = fp->rx_dma[i];
+
+		/* The RMC doesn't count the preamble and the starting
+		 * delimiter.  We fix it up here for a total of 3 octets.
+		 */
+		dma_rmb();
+		len = (rmc & FZA_RING_PBC_MASK) + 3;
+		frame = (struct fza_fddihdr *)skb->data;
+
+		/* We need to get at real FC. */
+		dma_sync_single_for_cpu(fp->bdev,
+					dma +
+					((u8 *)&frame->hdr.fc - (u8 *)frame),
+					sizeof(frame->hdr.fc),
+					DMA_FROM_DEVICE);
+		fc = frame->hdr.fc;
+
+		if (fza_rx_err(fp, rmc, fc))
+			goto err_rx;
+
+		/* We have to 512-byte-align RX buffers... */
+		newskb = fza_alloc_skb_irq(dev, FZA_RX_BUFFER_SIZE + 511);
+		if (newskb) {
+			fza_skb_align(newskb, 512);
+			newdma = dma_map_single(fp->bdev, newskb->data,
+						FZA_RX_BUFFER_SIZE,
+						DMA_FROM_DEVICE);
+			if (dma_mapping_error(fp->bdev, newdma)) {
+				dev_kfree_skb_irq(newskb);
+				newskb = NULL;
+			}
+		}
+		if (newskb) {
+			int pkt_len = len - 7;	/* Omit P, SD and FCS. */
+			int is_multi;
+			int rx_stat;
+
+			dma_unmap_single(fp->bdev, dma, FZA_RX_BUFFER_SIZE,
+					 DMA_FROM_DEVICE);
+
+			/* Queue SMT frames to the SMT receive ring. */
+			if ((fc & (FDDI_FC_K_CLASS_MASK |
+				   FDDI_FC_K_FORMAT_MASK)) ==
+			     (FDDI_FC_K_CLASS_ASYNC |
+			      FDDI_FC_K_FORMAT_MANAGEMENT) &&
+			    (rmc & FZA_RING_RX_DA_MASK) !=
+			     FZA_RING_RX_DA_PROM) {
+				if (fza_do_recv_smt((struct fza_buffer_tx *)
+						    skb->data, len, rmc,
+						    dev)) {
+					writel_o(FZA_CONTROL_A_SMT_RX_OVFL,
+						 &fp->regs->control_a);
+				}
+			}
+
+			is_multi = ((frame->hdr.daddr[0] & 0x01) != 0);
+
+			skb_reserve(skb, 3);	/* Skip over P and SD. */
+			skb_put(skb, pkt_len);	/* And cut off FCS. */
+			skb->protocol = fddi_type_trans(skb, dev);
+
+			rx_stat = netif_rx(skb);
+			if (rx_stat != NET_RX_DROP) {
+				fp->stats.rx_packets++;
+				fp->stats.rx_bytes += pkt_len;
+				if (is_multi)
+					fp->stats.multicast++;
+			} else {
+				fp->stats.rx_dropped++;
+			}
+
+			skb = newskb;
+			dma = newdma;
+			fp->rx_skbuff[i] = skb;
+			fp->rx_dma[i] = dma;
+		} else {
+			fp->stats.rx_dropped++;
+			pr_notice("%s: memory squeeze, dropping packet\n",
+				  fp->name);
+		}
+
+err_rx:
+		writel_o(0, &fp->ring_hst_rx[i].rmc);
+		buf = (dma + 0x1000) >> 9;
+		writel_o(buf, &fp->ring_hst_rx[i].buffer1);
+		buf = dma >> 9 | FZA_RING_OWN_FZA;
+		writel_o(buf, &fp->ring_hst_rx[i].buf0_own);
+		fp->ring_hst_rx_index =
+			(fp->ring_hst_rx_index + 1) % fp->ring_hst_rx_size;
+	}
+}
+
+static void fza_tx_smt(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct fza_buffer_tx __iomem *smt_tx_ptr, *skb_data_ptr;
+	int i, len;
+	u32 own;
+
+	while (1) {
+		i = fp->ring_smt_tx_index;
+		own = readl_o(&fp->ring_smt_tx[i].own);
+		if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA)
+			break;
+
+		smt_tx_ptr = fp->mmio + readl_u(&fp->ring_smt_tx[i].buffer);
+		len = readl_u(&fp->ring_smt_tx[i].rmc) & FZA_RING_PBC_MASK;
+
+		/* Queue the frame to the RMC transmit ring. */
+		if (!netif_queue_stopped(dev))
+			fza_do_xmit((union fza_buffer_txp)
+				    { .mmio_ptr = smt_tx_ptr },
+				    len, dev, 1);
+
+		writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_tx[i].own);
+		fp->ring_smt_tx_index =
+			(fp->ring_smt_tx_index + 1) % fp->ring_smt_tx_size;
+	}
+}
+
+static void fza_uns(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	u32 own;
+	int i;
+
+	while (1) {
+		i = fp->ring_uns_index;
+		own = readl_o(&fp->ring_uns[i].own);
+		if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA)
+			break;
+
+		if (readl_u(&fp->ring_uns[i].id) == FZA_RING_UNS_RX_OVER) {
+			fp->stats.rx_errors++;
+			fp->stats.rx_over_errors++;
+		}
+
+		writel_o(FZA_RING_OWN_FZA, &fp->ring_uns[i].own);
+		fp->ring_uns_index =
+			(fp->ring_uns_index + 1) % FZA_RING_UNS_SIZE;
+	}
+}
+
+static void fza_tx_flush(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	u32 own;
+	int i;
+
+	/* Clean up the SMT TX ring. */
+	i = fp->ring_smt_tx_index;
+	do {
+		writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_tx[i].own);
+		fp->ring_smt_tx_index =
+			(fp->ring_smt_tx_index + 1) % fp->ring_smt_tx_size;
+
+	} while (i != fp->ring_smt_tx_index);
+
+	/* Clean up the RMC TX ring. */
+	i = fp->ring_rmc_tx_index;
+	do {
+		own = readl_o(&fp->ring_rmc_tx[i].own);
+		if ((own & FZA_RING_OWN_MASK) == FZA_RING_TX_OWN_RMC) {
+			u32 rmc = readl_u(&fp->ring_rmc_tx[i].rmc);
+
+			writel_u(rmc | FZA_RING_TX_DTP,
+				 &fp->ring_rmc_tx[i].rmc);
+		}
+		fp->ring_rmc_tx_index =
+			(fp->ring_rmc_tx_index + 1) % fp->ring_rmc_tx_size;
+
+	} while (i != fp->ring_rmc_tx_index);
+
+	/* Done. */
+	writew_o(FZA_CONTROL_A_FLUSH_DONE, &fp->regs->control_a);
+}
+
+static irqreturn_t fza_interrupt(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct fza_private *fp = netdev_priv(dev);
+	uint int_event;
+
+	/* Get interrupt events. */
+	int_event = readw_o(&fp->regs->int_event) & fp->int_mask;
+	if (int_event == 0)
+		return IRQ_NONE;
+
+	/* Clear the events. */
+	writew_u(int_event, &fp->regs->int_event);
+
+	/* Now handle the events.  The order matters. */
+
+	/* Command finished interrupt. */
+	if ((int_event & FZA_EVENT_CMD_DONE) != 0) {
+		fp->irq_count_cmd_done++;
+
+		spin_lock(&fp->lock);
+		fp->cmd_done_flag = 1;
+		wake_up(&fp->cmd_done_wait);
+		spin_unlock(&fp->lock);
+	}
+
+	/* Transmit finished interrupt. */
+	if ((int_event & FZA_EVENT_TX_DONE) != 0) {
+		fp->irq_count_tx_done++;
+		fza_tx(dev);
+	}
+
+	/* Host receive interrupt. */
+	if ((int_event & FZA_EVENT_RX_POLL) != 0) {
+		fp->irq_count_rx_poll++;
+		fza_rx(dev);
+	}
+
+	/* SMT transmit interrupt. */
+	if ((int_event & FZA_EVENT_SMT_TX_POLL) != 0) {
+		fp->irq_count_smt_tx_poll++;
+		fza_tx_smt(dev);
+	}
+
+	/* Transmit ring flush request. */
+	if ((int_event & FZA_EVENT_FLUSH_TX) != 0) {
+		fp->irq_count_flush_tx++;
+		fza_tx_flush(dev);
+	}
+
+	/* Link status change interrupt. */
+	if ((int_event & FZA_EVENT_LINK_ST_CHG) != 0) {
+		uint status;
+
+		fp->irq_count_link_st_chg++;
+		status = readw_u(&fp->regs->status);
+		if (FZA_STATUS_GET_LINK(status) == FZA_LINK_ON) {
+			netif_carrier_on(dev);
+			pr_info("%s: link available\n", fp->name);
+		} else {
+			netif_carrier_off(dev);
+			pr_info("%s: link unavailable\n", fp->name);
+		}
+	}
+
+	/* Unsolicited event interrupt. */
+	if ((int_event & FZA_EVENT_UNS_POLL) != 0) {
+		fp->irq_count_uns_poll++;
+		fza_uns(dev);
+	}
+
+	/* State change interrupt. */
+	if ((int_event & FZA_EVENT_STATE_CHG) != 0) {
+		uint status, state;
+
+		fp->irq_count_state_chg++;
+
+		status = readw_u(&fp->regs->status);
+		state = FZA_STATUS_GET_STATE(status);
+		pr_debug("%s: state change: %x\n", fp->name, state);
+		switch (state) {
+		case FZA_STATE_RESET:
+			break;
+
+		case FZA_STATE_UNINITIALIZED:
+			netif_carrier_off(dev);
+			del_timer_sync(&fp->reset_timer);
+			fp->ring_cmd_index = 0;
+			fp->ring_uns_index = 0;
+			fp->ring_rmc_tx_index = 0;
+			fp->ring_rmc_txd_index = 0;
+			fp->ring_hst_rx_index = 0;
+			fp->ring_smt_tx_index = 0;
+			fp->ring_smt_rx_index = 0;
+			if (fp->state > state) {
+				pr_info("%s: OK\n", fp->name);
+				fza_cmd_send(dev, FZA_RING_CMD_INIT);
+			}
+			break;
+
+		case FZA_STATE_INITIALIZED:
+			if (fp->state > state) {
+				fza_set_rx_mode(dev);
+				fza_cmd_send(dev, FZA_RING_CMD_PARAM);
+			}
+			break;
+
+		case FZA_STATE_RUNNING:
+		case FZA_STATE_MAINTENANCE:
+			fp->state = state;
+			fza_rx_init(fp);
+			fp->queue_active = 1;
+			netif_wake_queue(dev);
+			pr_debug("%s: queue woken\n", fp->name);
+			break;
+
+		case FZA_STATE_HALTED:
+			fp->queue_active = 0;
+			netif_stop_queue(dev);
+			pr_debug("%s: queue stopped\n", fp->name);
+			del_timer_sync(&fp->reset_timer);
+			pr_warn("%s: halted, reason: %x\n", fp->name,
+				FZA_STATUS_GET_HALT(status));
+			fza_regs_dump(fp);
+			pr_info("%s: resetting the board...\n", fp->name);
+			fza_do_reset(fp);
+			fp->timer_state = 0;
+			fp->reset_timer.expires = jiffies + 45 * HZ;
+			add_timer(&fp->reset_timer);
+			break;
+
+		default:
+			pr_warn("%s: undefined state: %x\n", fp->name, state);
+			break;
+		}
+
+		spin_lock(&fp->lock);
+		fp->state_chg_flag = 1;
+		wake_up(&fp->state_chg_wait);
+		spin_unlock(&fp->lock);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void fza_reset_timer(struct timer_list *t)
+{
+	struct fza_private *fp = from_timer(fp, t, reset_timer);
+
+	if (!fp->timer_state) {
+		pr_err("%s: RESET timed out!\n", fp->name);
+		pr_info("%s: trying harder...\n", fp->name);
+
+		/* Assert the board reset. */
+		writew_o(FZA_RESET_INIT, &fp->regs->reset);
+		readw_o(&fp->regs->reset);		/* Synchronize. */
+
+		fp->timer_state = 1;
+		fp->reset_timer.expires = jiffies + HZ;
+	} else {
+		/* Clear the board reset. */
+		writew_u(FZA_RESET_CLR, &fp->regs->reset);
+
+		/* Enable all interrupt events we handle. */
+		writew_o(fp->int_mask, &fp->regs->int_mask);
+		readw_o(&fp->regs->int_mask);		/* Synchronize. */
+
+		fp->timer_state = 0;
+		fp->reset_timer.expires = jiffies + 45 * HZ;
+	}
+	add_timer(&fp->reset_timer);
+}
+
+static int fza_set_mac_address(struct net_device *dev, void *addr)
+{
+	return -EOPNOTSUPP;
+}
+
+static netdev_tx_t fza_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	unsigned int old_mask, new_mask;
+	int ret;
+	u8 fc;
+
+	skb_push(skb, 3);			/* Make room for PRH. */
+
+	/* Decode FC to set PRH. */
+	fc = skb->data[3];
+	skb->data[0] = 0;
+	skb->data[1] = 0;
+	skb->data[2] = FZA_PRH2_NORMAL;
+	if ((fc & FDDI_FC_K_CLASS_MASK) == FDDI_FC_K_CLASS_SYNC)
+		skb->data[0] |= FZA_PRH0_FRAME_SYNC;
+	switch (fc & FDDI_FC_K_FORMAT_MASK) {
+	case FDDI_FC_K_FORMAT_MANAGEMENT:
+		if ((fc & FDDI_FC_K_CONTROL_MASK) == 0) {
+			/* Token. */
+			skb->data[0] |= FZA_PRH0_TKN_TYPE_IMM;
+			skb->data[1] |= FZA_PRH1_TKN_SEND_NONE;
+		} else {
+			/* SMT or MAC. */
+			skb->data[0] |= FZA_PRH0_TKN_TYPE_UNR;
+			skb->data[1] |= FZA_PRH1_TKN_SEND_UNR;
+		}
+		skb->data[1] |= FZA_PRH1_CRC_NORMAL;
+		break;
+	case FDDI_FC_K_FORMAT_LLC:
+	case FDDI_FC_K_FORMAT_FUTURE:
+		skb->data[0] |= FZA_PRH0_TKN_TYPE_UNR;
+		skb->data[1] |= FZA_PRH1_CRC_NORMAL | FZA_PRH1_TKN_SEND_UNR;
+		break;
+	case FDDI_FC_K_FORMAT_IMPLEMENTOR:
+		skb->data[0] |= FZA_PRH0_TKN_TYPE_UNR;
+		skb->data[1] |= FZA_PRH1_TKN_SEND_ORIG;
+		break;
+	}
+
+	/* SMT transmit interrupts may sneak frames into the RMC
+	 * transmit ring.  We disable them while queueing a frame
+	 * to maintain consistency.
+	 */
+	old_mask = fp->int_mask;
+	new_mask = old_mask & ~FZA_MASK_SMT_TX_POLL;
+	writew_u(new_mask, &fp->regs->int_mask);
+	readw_o(&fp->regs->int_mask);			/* Synchronize. */
+	fp->int_mask = new_mask;
+	ret = fza_do_xmit((union fza_buffer_txp)
+			  { .data_ptr = (struct fza_buffer_tx *)skb->data },
+			  skb->len, dev, 0);
+	fp->int_mask = old_mask;
+	writew_u(fp->int_mask, &fp->regs->int_mask);
+
+	if (ret) {
+		/* Probably an SMT packet filled the remaining space,
+		 * so just stop the queue, but don't report it as an error.
+		 */
+		netif_stop_queue(dev);
+		pr_debug("%s: queue stopped\n", fp->name);
+		fp->stats.tx_dropped++;
+	}
+
+	dev_kfree_skb(skb);
+
+	return ret;
+}
+
+static int fza_open(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	struct fza_ring_cmd __iomem *ring;
+	struct sk_buff *skb;
+	unsigned long flags;
+	dma_addr_t dma;
+	int ret, i;
+	u32 stat;
+	long t;
+
+	for (i = 0; i < FZA_RING_RX_SIZE; i++) {
+		/* We have to 512-byte-align RX buffers... */
+		skb = fza_alloc_skb(dev, FZA_RX_BUFFER_SIZE + 511);
+		if (skb) {
+			fza_skb_align(skb, 512);
+			dma = dma_map_single(fp->bdev, skb->data,
+					     FZA_RX_BUFFER_SIZE,
+					     DMA_FROM_DEVICE);
+			if (dma_mapping_error(fp->bdev, dma)) {
+				dev_kfree_skb(skb);
+				skb = NULL;
+			}
+		}
+		if (!skb) {
+			for (--i; i >= 0; i--) {
+				dma_unmap_single(fp->bdev, fp->rx_dma[i],
+						 FZA_RX_BUFFER_SIZE,
+						 DMA_FROM_DEVICE);
+				dev_kfree_skb(fp->rx_skbuff[i]);
+				fp->rx_dma[i] = 0;
+				fp->rx_skbuff[i] = NULL;
+			}
+			return -ENOMEM;
+		}
+		fp->rx_skbuff[i] = skb;
+		fp->rx_dma[i] = dma;
+	}
+
+	ret = fza_init_send(dev, NULL);
+	if (ret != 0)
+		return ret;
+
+	/* Purger and Beacon multicasts need to be supplied before PARAM. */
+	fza_set_rx_mode(dev);
+
+	spin_lock_irqsave(&fp->lock, flags);
+	fp->cmd_done_flag = 0;
+	ring = fza_cmd_send(dev, FZA_RING_CMD_PARAM);
+	spin_unlock_irqrestore(&fp->lock, flags);
+	if (!ring)
+		return -ENOBUFS;
+
+	t = wait_event_timeout(fp->cmd_done_wait, fp->cmd_done_flag, 3 * HZ);
+	if (fp->cmd_done_flag == 0) {
+		pr_err("%s: PARAM command timed out!, state %x\n", fp->name,
+		       FZA_STATUS_GET_STATE(readw_u(&fp->regs->status)));
+		return -EIO;
+	}
+	stat = readl_u(&ring->stat);
+	if (stat != FZA_RING_STAT_SUCCESS) {
+		pr_err("%s: PARAM command failed!, status %02x, state %x\n",
+		       fp->name, stat,
+		       FZA_STATUS_GET_STATE(readw_u(&fp->regs->status)));
+		return -EIO;
+	}
+	pr_debug("%s: PARAM: %lums elapsed\n", fp->name,
+		 (3 * HZ - t) * 1000 / HZ);
+
+	return 0;
+}
+
+static int fza_close(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+	unsigned long flags;
+	uint state;
+	long t;
+	int i;
+
+	netif_stop_queue(dev);
+	pr_debug("%s: queue stopped\n", fp->name);
+
+	del_timer_sync(&fp->reset_timer);
+	spin_lock_irqsave(&fp->lock, flags);
+	fp->state = FZA_STATE_UNINITIALIZED;
+	fp->state_chg_flag = 0;
+	/* Shut the interface down. */
+	writew_o(FZA_CONTROL_A_SHUT, &fp->regs->control_a);
+	readw_o(&fp->regs->control_a);			/* Synchronize. */
+	spin_unlock_irqrestore(&fp->lock, flags);
+
+	/* DEC says SHUT needs up to 10 seconds to complete. */
+	t = wait_event_timeout(fp->state_chg_wait, fp->state_chg_flag,
+			       15 * HZ);
+	state = FZA_STATUS_GET_STATE(readw_o(&fp->regs->status));
+	if (fp->state_chg_flag == 0) {
+		pr_err("%s: SHUT timed out!, state %x\n", fp->name, state);
+		return -EIO;
+	}
+	if (state != FZA_STATE_UNINITIALIZED) {
+		pr_err("%s: SHUT failed!, state %x\n", fp->name, state);
+		return -EIO;
+	}
+	pr_debug("%s: SHUT: %lums elapsed\n", fp->name,
+		 (15 * HZ - t) * 1000 / HZ);
+
+	for (i = 0; i < FZA_RING_RX_SIZE; i++)
+		if (fp->rx_skbuff[i]) {
+			dma_unmap_single(fp->bdev, fp->rx_dma[i],
+					 FZA_RX_BUFFER_SIZE, DMA_FROM_DEVICE);
+			dev_kfree_skb(fp->rx_skbuff[i]);
+			fp->rx_dma[i] = 0;
+			fp->rx_skbuff[i] = NULL;
+		}
+
+	return 0;
+}
+
+static struct net_device_stats *fza_get_stats(struct net_device *dev)
+{
+	struct fza_private *fp = netdev_priv(dev);
+
+	return &fp->stats;
+}
+
+static int fza_probe(struct device *bdev)
+{
+	static const struct net_device_ops netdev_ops = {
+		.ndo_open = fza_open,
+		.ndo_stop = fza_close,
+		.ndo_start_xmit = fza_start_xmit,
+		.ndo_set_rx_mode = fza_set_rx_mode,
+		.ndo_set_mac_address = fza_set_mac_address,
+		.ndo_get_stats = fza_get_stats,
+	};
+	static int version_printed;
+	char rom_rev[4], fw_rev[4], rmc_rev[4];
+	struct tc_dev *tdev = to_tc_dev(bdev);
+	struct fza_cmd_init __iomem *init;
+	resource_size_t start, len;
+	struct net_device *dev;
+	struct fza_private *fp;
+	uint smt_ver, pmd_type;
+	void __iomem *mmio;
+	uint hw_addr[2];
+	int ret, i;
+
+	if (!version_printed) {
+		pr_info("%s", version);
+		version_printed = 1;
+	}
+
+	dev = alloc_fddidev(sizeof(*fp));
+	if (!dev)
+		return -ENOMEM;
+	SET_NETDEV_DEV(dev, bdev);
+
+	fp = netdev_priv(dev);
+	dev_set_drvdata(bdev, dev);
+
+	fp->bdev = bdev;
+	fp->name = dev_name(bdev);
+
+	/* Request the I/O MEM resource. */
+	start = tdev->resource.start;
+	len = tdev->resource.end - start + 1;
+	if (!request_mem_region(start, len, dev_name(bdev))) {
+		pr_err("%s: cannot reserve MMIO region\n", fp->name);
+		ret = -EBUSY;
+		goto err_out_kfree;
+	}
+
+	/* MMIO mapping setup. */
+	mmio = ioremap_nocache(start, len);
+	if (!mmio) {
+		pr_err("%s: cannot map MMIO\n", fp->name);
+		ret = -ENOMEM;
+		goto err_out_resource;
+	}
+
+	/* Initialize the new device structure. */
+	switch (loopback) {
+	case FZA_LOOP_NORMAL:
+	case FZA_LOOP_INTERN:
+	case FZA_LOOP_EXTERN:
+		break;
+	default:
+		loopback = FZA_LOOP_NORMAL;
+	}
+
+	fp->mmio = mmio;
+	dev->irq = tdev->interrupt;
+
+	pr_info("%s: DEC FDDIcontroller 700 or 700-C at 0x%08llx, irq %d\n",
+		fp->name, (long long)tdev->resource.start, dev->irq);
+	pr_debug("%s: mapped at: 0x%p\n", fp->name, mmio);
+
+	fp->regs = mmio + FZA_REG_BASE;
+	fp->ring_cmd = mmio + FZA_RING_CMD;
+	fp->ring_uns = mmio + FZA_RING_UNS;
+
+	init_waitqueue_head(&fp->state_chg_wait);
+	init_waitqueue_head(&fp->cmd_done_wait);
+	spin_lock_init(&fp->lock);
+	fp->int_mask = FZA_MASK_NORMAL;
+
+	timer_setup(&fp->reset_timer, fza_reset_timer, 0);
+
+	/* Sanitize the board. */
+	fza_regs_dump(fp);
+	fza_do_shutdown(fp);
+
+	ret = request_irq(dev->irq, fza_interrupt, IRQF_SHARED, fp->name, dev);
+	if (ret != 0) {
+		pr_err("%s: unable to get IRQ %d!\n", fp->name, dev->irq);
+		goto err_out_map;
+	}
+
+	/* Enable the driver mode. */
+	writew_o(FZA_CONTROL_B_DRIVER, &fp->regs->control_b);
+
+	/* For some reason transmit done interrupts can trigger during
+	 * reset.  This avoids a division error in the handler.
+	 */
+	fp->ring_rmc_tx_size = FZA_RING_TX_SIZE;
+
+	ret = fza_reset(fp);
+	if (ret != 0)
+		goto err_out_irq;
+
+	ret = fza_init_send(dev, &init);
+	if (ret != 0)
+		goto err_out_irq;
+
+	fza_reads(&init->hw_addr, &hw_addr, sizeof(hw_addr));
+	memcpy(dev->dev_addr, &hw_addr, FDDI_K_ALEN);
+
+	fza_reads(&init->rom_rev, &rom_rev, sizeof(rom_rev));
+	fza_reads(&init->fw_rev, &fw_rev, sizeof(fw_rev));
+	fza_reads(&init->rmc_rev, &rmc_rev, sizeof(rmc_rev));
+	for (i = 3; i >= 0 && rom_rev[i] == ' '; i--)
+		rom_rev[i] = 0;
+	for (i = 3; i >= 0 && fw_rev[i] == ' '; i--)
+		fw_rev[i] = 0;
+	for (i = 3; i >= 0 && rmc_rev[i] == ' '; i--)
+		rmc_rev[i] = 0;
+
+	fp->ring_rmc_tx = mmio + readl_u(&init->rmc_tx);
+	fp->ring_rmc_tx_size = readl_u(&init->rmc_tx_size);
+	fp->ring_hst_rx = mmio + readl_u(&init->hst_rx);
+	fp->ring_hst_rx_size = readl_u(&init->hst_rx_size);
+	fp->ring_smt_tx = mmio + readl_u(&init->smt_tx);
+	fp->ring_smt_tx_size = readl_u(&init->smt_tx_size);
+	fp->ring_smt_rx = mmio + readl_u(&init->smt_rx);
+	fp->ring_smt_rx_size = readl_u(&init->smt_rx_size);
+
+	fp->buffer_tx = mmio + FZA_TX_BUFFER_ADDR(readl_u(&init->rmc_tx));
+
+	fp->t_max = readl_u(&init->def_t_max);
+	fp->t_req = readl_u(&init->def_t_req);
+	fp->tvx = readl_u(&init->def_tvx);
+	fp->lem_threshold = readl_u(&init->lem_threshold);
+	fza_reads(&init->def_station_id, &fp->station_id,
+		  sizeof(fp->station_id));
+	fp->rtoken_timeout = readl_u(&init->rtoken_timeout);
+	fp->ring_purger = readl_u(&init->ring_purger);
+
+	smt_ver = readl_u(&init->smt_ver);
+	pmd_type = readl_u(&init->pmd_type);
+
+	pr_debug("%s: INIT parameters:\n", fp->name);
+	pr_debug("        tx_mode: %u\n", readl_u(&init->tx_mode));
+	pr_debug("    hst_rx_size: %u\n", readl_u(&init->hst_rx_size));
+	pr_debug("        rmc_rev: %.4s\n", rmc_rev);
+	pr_debug("        rom_rev: %.4s\n", rom_rev);
+	pr_debug("         fw_rev: %.4s\n", fw_rev);
+	pr_debug("       mop_type: %u\n", readl_u(&init->mop_type));
+	pr_debug("         hst_rx: 0x%08x\n", readl_u(&init->hst_rx));
+	pr_debug("         rmc_tx: 0x%08x\n", readl_u(&init->rmc_tx));
+	pr_debug("    rmc_tx_size: %u\n", readl_u(&init->rmc_tx_size));
+	pr_debug("         smt_tx: 0x%08x\n", readl_u(&init->smt_tx));
+	pr_debug("    smt_tx_size: %u\n", readl_u(&init->smt_tx_size));
+	pr_debug("         smt_rx: 0x%08x\n", readl_u(&init->smt_rx));
+	pr_debug("    smt_rx_size: %u\n", readl_u(&init->smt_rx_size));
+	/* TC systems are always LE, so don't bother swapping. */
+	pr_debug("        hw_addr: 0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+		 (readl_u(&init->hw_addr[0]) >> 0) & 0xff,
+		 (readl_u(&init->hw_addr[0]) >> 8) & 0xff,
+		 (readl_u(&init->hw_addr[0]) >> 16) & 0xff,
+		 (readl_u(&init->hw_addr[0]) >> 24) & 0xff,
+		 (readl_u(&init->hw_addr[1]) >> 0) & 0xff,
+		 (readl_u(&init->hw_addr[1]) >> 8) & 0xff,
+		 (readl_u(&init->hw_addr[1]) >> 16) & 0xff,
+		 (readl_u(&init->hw_addr[1]) >> 24) & 0xff);
+	pr_debug("      def_t_req: %u\n", readl_u(&init->def_t_req));
+	pr_debug("        def_tvx: %u\n", readl_u(&init->def_tvx));
+	pr_debug("      def_t_max: %u\n", readl_u(&init->def_t_max));
+	pr_debug("  lem_threshold: %u\n", readl_u(&init->lem_threshold));
+	/* Don't bother swapping, see above. */
+	pr_debug(" def_station_id: 0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+		 (readl_u(&init->def_station_id[0]) >> 0) & 0xff,
+		 (readl_u(&init->def_station_id[0]) >> 8) & 0xff,
+		 (readl_u(&init->def_station_id[0]) >> 16) & 0xff,
+		 (readl_u(&init->def_station_id[0]) >> 24) & 0xff,
+		 (readl_u(&init->def_station_id[1]) >> 0) & 0xff,
+		 (readl_u(&init->def_station_id[1]) >> 8) & 0xff,
+		 (readl_u(&init->def_station_id[1]) >> 16) & 0xff,
+		 (readl_u(&init->def_station_id[1]) >> 24) & 0xff);
+	pr_debug("   pmd_type_alt: %u\n", readl_u(&init->pmd_type_alt));
+	pr_debug("        smt_ver: %u\n", readl_u(&init->smt_ver));
+	pr_debug(" rtoken_timeout: %u\n", readl_u(&init->rtoken_timeout));
+	pr_debug("    ring_purger: %u\n", readl_u(&init->ring_purger));
+	pr_debug("    smt_ver_max: %u\n", readl_u(&init->smt_ver_max));
+	pr_debug("    smt_ver_min: %u\n", readl_u(&init->smt_ver_min));
+	pr_debug("       pmd_type: %u\n", readl_u(&init->pmd_type));
+
+	pr_info("%s: model %s, address %pMF\n",
+		fp->name,
+		pmd_type == FZA_PMD_TYPE_TW ?
+			"700-C (DEFZA-CA), ThinWire PMD selected" :
+			pmd_type == FZA_PMD_TYPE_STP ?
+				"700-C (DEFZA-CA), STP PMD selected" :
+				"700 (DEFZA-AA), MMF PMD",
+		dev->dev_addr);
+	pr_info("%s: ROM rev. %.4s, firmware rev. %.4s, RMC rev. %.4s, "
+		"SMT ver. %u\n", fp->name, rom_rev, fw_rev, rmc_rev, smt_ver);
+
+	/* Now that we fetched initial parameters just shut the interface
+	 * until opened.
+	 */
+	ret = fza_close(dev);
+	if (ret != 0)
+		goto err_out_irq;
+
+	/* The FZA-specific entries in the device structure. */
+	dev->netdev_ops = &netdev_ops;
+
+	ret = register_netdev(dev);
+	if (ret != 0)
+		goto err_out_irq;
+
+	pr_info("%s: registered as %s\n", fp->name, dev->name);
+	fp->name = (const char *)dev->name;
+
+	get_device(bdev);
+	return 0;
+
+err_out_irq:
+	del_timer_sync(&fp->reset_timer);
+	fza_do_shutdown(fp);
+	free_irq(dev->irq, dev);
+
+err_out_map:
+	iounmap(mmio);
+
+err_out_resource:
+	release_mem_region(start, len);
+
+err_out_kfree:
+	free_netdev(dev);
+
+	pr_err("%s: initialization failure, aborting!\n", fp->name);
+	return ret;
+}
+
+static int fza_remove(struct device *bdev)
+{
+	struct net_device *dev = dev_get_drvdata(bdev);
+	struct fza_private *fp = netdev_priv(dev);
+	struct tc_dev *tdev = to_tc_dev(bdev);
+	resource_size_t start, len;
+
+	put_device(bdev);
+
+	unregister_netdev(dev);
+
+	del_timer_sync(&fp->reset_timer);
+	fza_do_shutdown(fp);
+	free_irq(dev->irq, dev);
+
+	iounmap(fp->mmio);
+
+	start = tdev->resource.start;
+	len = tdev->resource.end - start + 1;
+	release_mem_region(start, len);
+
+	free_netdev(dev);
+
+	return 0;
+}
+
+static struct tc_device_id const fza_tc_table[] = {
+	{ "DEC     ", "PMAF-AA " },
+	{ }
+};
+MODULE_DEVICE_TABLE(tc, fza_tc_table);
+
+static struct tc_driver fza_driver = {
+	.id_table	= fza_tc_table,
+	.driver		= {
+		.name	= "defza",
+		.bus	= &tc_bus_type,
+		.probe	= fza_probe,
+		.remove	= fza_remove,
+	},
+};
+
+static int fza_init(void)
+{
+	return tc_register_driver(&fza_driver);
+}
+
+static void fza_exit(void)
+{
+	tc_unregister_driver(&fza_driver);
+}
+
+module_init(fza_init);
+module_exit(fza_exit);
diff --git a/drivers/net/fddi/defza.h b/drivers/net/fddi/defza.h
new file mode 100644
index 000000000000..b06acf32738e
--- /dev/null
+++ b/drivers/net/fddi/defza.h
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*	FDDI network adapter driver for DEC FDDIcontroller 700/700-C devices.
+ *
+ *	Copyright (c) 2018  Maciej W. Rozycki
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	References:
+ *
+ *	Dave Sawyer & Phil Weeks & Frank Itkowsky,
+ *	"DEC FDDIcontroller 700 Port Specification",
+ *	Revision 1.1, Digital Equipment Corporation
+ */
+
+#include <linux/compiler.h>
+#include <linux/if_fddi.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+
+/* IOmem register offsets. */
+#define FZA_REG_BASE		0x100000	/* register base address */
+#define FZA_REG_RESET		0x100200	/* reset, r/w */
+#define FZA_REG_INT_EVENT	0x100400	/* interrupt event, r/w1c */
+#define FZA_REG_STATUS		0x100402	/* status, r/o */
+#define FZA_REG_INT_MASK	0x100404	/* interrupt mask, r/w */
+#define FZA_REG_CONTROL_A	0x100500	/* control A, r/w1s */
+#define FZA_REG_CONTROL_B	0x100502	/* control B, r/w */
+
+/* Reset register constants.  Bits 1:0 are r/w, others are fixed at 0. */
+#define FZA_RESET_DLU	0x0002	/* OR with INIT to blast flash memory */
+#define FZA_RESET_INIT	0x0001	/* switch into the reset state */
+#define FZA_RESET_CLR	0x0000	/* run self-test and return to work */
+
+/* Interrupt event register constants.  All bits are r/w1c. */
+#define FZA_EVENT_DLU_DONE	0x0800	/* flash memory write complete */
+#define FZA_EVENT_FLUSH_TX	0x0400	/* transmit ring flush request */
+#define FZA_EVENT_PM_PARITY_ERR	0x0200	/* onboard packet memory parity err */
+#define FZA_EVENT_HB_PARITY_ERR	0x0100	/* host bus parity error */
+#define FZA_EVENT_NXM_ERR	0x0080	/* non-existent memory access error;
+					 * also raised for unaligned and
+					 * unsupported partial-word accesses
+					 */
+#define FZA_EVENT_LINK_ST_CHG	0x0040	/* link status change */
+#define FZA_EVENT_STATE_CHG	0x0020	/* adapter state change */
+#define FZA_EVENT_UNS_POLL	0x0010	/* unsolicited event service request */
+#define FZA_EVENT_CMD_DONE	0x0008	/* command done ack */
+#define FZA_EVENT_SMT_TX_POLL	0x0004	/* SMT frame transmit request */
+#define FZA_EVENT_RX_POLL	0x0002	/* receive request (packet avail.) */
+#define FZA_EVENT_TX_DONE	0x0001	/* RMC transmit done ack */
+
+/* Status register constants.  All bits are r/o. */
+#define FZA_STATUS_DLU_SHIFT	0xc	/* down line upgrade status bits */
+#define FZA_STATUS_DLU_MASK	0x03
+#define FZA_STATUS_LINK_SHIFT	0xb	/* link status bits */
+#define FZA_STATUS_LINK_MASK	0x01
+#define FZA_STATUS_STATE_SHIFT	0x8	/* adapter state bits */
+#define FZA_STATUS_STATE_MASK	0x07
+#define FZA_STATUS_HALT_SHIFT	0x0	/* halt reason bits */
+#define FZA_STATUS_HALT_MASK	0xff
+#define FZA_STATUS_TEST_SHIFT	0x0	/* test failure bits */
+#define FZA_STATUS_TEST_MASK	0xff
+
+#define FZA_STATUS_GET_DLU(x)	(((x) >> FZA_STATUS_DLU_SHIFT) &	\
+				 FZA_STATUS_DLU_MASK)
+#define FZA_STATUS_GET_LINK(x)	(((x) >> FZA_STATUS_LINK_SHIFT) &	\
+				 FZA_STATUS_LINK_MASK)
+#define FZA_STATUS_GET_STATE(x)	(((x) >> FZA_STATUS_STATE_SHIFT) &	\
+				 FZA_STATUS_STATE_MASK)
+#define FZA_STATUS_GET_HALT(x)	(((x) >> FZA_STATUS_HALT_SHIFT) &	\
+				 FZA_STATUS_HALT_MASK)
+#define FZA_STATUS_GET_TEST(x)	(((x) >> FZA_STATUS_TEST_SHIFT) &	\
+				 FZA_STATUS_TEST_MASK)
+
+#define FZA_DLU_FAILURE		0x0	/* DLU catastrophic error; brain dead */
+#define FZA_DLU_ERROR		0x1	/* DLU error; old firmware intact */
+#define FZA_DLU_SUCCESS		0x2	/* DLU OK; new firmware loaded */
+
+#define FZA_LINK_OFF		0x0	/* link unavailable */
+#define FZA_LINK_ON		0x1	/* link available */
+
+#define FZA_STATE_RESET		0x0	/* resetting */
+#define FZA_STATE_UNINITIALIZED	0x1	/* after a reset */
+#define FZA_STATE_INITIALIZED	0x2	/* initialized */
+#define FZA_STATE_RUNNING	0x3	/* running (link active) */
+#define FZA_STATE_MAINTENANCE	0x4	/* running (link looped back) */
+#define FZA_STATE_HALTED	0x5	/* halted (error condition) */
+
+#define FZA_HALT_UNKNOWN	0x00	/* unknown reason */
+#define FZA_HALT_HOST		0x01	/* host-directed HALT */
+#define FZA_HALT_HB_PARITY	0x02	/* host bus parity error */
+#define FZA_HALT_NXM		0x03	/* adapter non-existent memory ref. */
+#define FZA_HALT_SW		0x04	/* adapter software fault */
+#define FZA_HALT_HW		0x05	/* adapter hardware fault */
+#define FZA_HALT_PC_TRACE	0x06	/* PC Trace path test */
+#define FZA_HALT_DLSW		0x07	/* data link software fault */
+#define FZA_HALT_DLHW		0x08	/* data link hardware fault */
+
+#define FZA_TEST_FATAL		0x00	/* self-test catastrophic failure */
+#define FZA_TEST_68K		0x01	/* 68000 CPU */
+#define FZA_TEST_SRAM_BWADDR	0x02	/* SRAM byte/word address */
+#define FZA_TEST_SRAM_DBUS	0x03	/* SRAM data bus */
+#define FZA_TEST_SRAM_STUCK1	0x04	/* SRAM stuck-at range 1 */
+#define FZA_TEST_SRAM_STUCK2	0x05	/* SRAM stuck-at range 2 */
+#define FZA_TEST_SRAM_COUPL1	0x06	/* SRAM coupling range 1 */
+#define FZA_TEST_SRAM_COUPL2	0x07	/* SRAM coupling */
+#define FZA_TEST_FLASH_CRC	0x08	/* Flash CRC */
+#define FZA_TEST_ROM		0x09	/* option ROM */
+#define FZA_TEST_PHY_CSR	0x0a	/* PHY CSR */
+#define FZA_TEST_MAC_BIST	0x0b	/* MAC BiST */
+#define FZA_TEST_MAC_CSR	0x0c	/* MAC CSR */
+#define FZA_TEST_MAC_ADDR_UNIQ	0x0d	/* MAC unique address */
+#define FZA_TEST_ELM_BIST	0x0e	/* ELM BiST */
+#define FZA_TEST_ELM_CSR	0x0f	/* ELM CSR */
+#define FZA_TEST_ELM_ADDR_UNIQ	0x10	/* ELM unique address */
+#define FZA_TEST_CAM		0x11	/* CAM */
+#define FZA_TEST_NIROM		0x12	/* NI ROM checksum */
+#define FZA_TEST_SC_LOOP	0x13	/* SC loopback packet */
+#define FZA_TEST_LM_LOOP	0x14	/* LM loopback packet */
+#define FZA_TEST_EB_LOOP	0x15	/* EB loopback packet */
+#define FZA_TEST_SC_LOOP_BYPS	0x16	/* SC bypass loopback packet */
+#define FZA_TEST_LM_LOOP_LOCAL	0x17	/* LM local loopback packet */
+#define FZA_TEST_EB_LOOP_LOCAL	0x18	/* EB local loopback packet */
+#define FZA_TEST_CDC_LOOP	0x19	/* CDC loopback packet */
+#define FZA_TEST_FIBER_LOOP	0x1A	/* FIBER loopback packet */
+#define FZA_TEST_CAM_MATCH_LOOP	0x1B	/* CAM match packet loopback */
+#define FZA_TEST_68K_IRQ_STUCK	0x1C	/* 68000 interrupt line stuck-at */
+#define FZA_TEST_IRQ_PRESENT	0x1D	/* interrupt present register */
+#define FZA_TEST_RMC_BIST	0x1E	/* RMC BiST */
+#define FZA_TEST_RMC_CSR	0x1F	/* RMC CSR */
+#define FZA_TEST_RMC_ADDR_UNIQ	0x20	/* RMC unique address */
+#define FZA_TEST_PM_DPATH	0x21	/* packet memory data path */
+#define FZA_TEST_PM_ADDR	0x22	/* packet memory address */
+#define FZA_TEST_RES_23		0x23	/* reserved */
+#define FZA_TEST_PM_DESC	0x24	/* packet memory descriptor */
+#define FZA_TEST_PM_OWN		0x25	/* packet memory own bit */
+#define FZA_TEST_PM_PARITY	0x26	/* packet memory parity */
+#define FZA_TEST_PM_BSWAP	0x27	/* packet memory byte swap */
+#define FZA_TEST_PM_WSWAP	0x28	/* packet memory word swap */
+#define FZA_TEST_PM_REF		0x29	/* packet memory refresh */
+#define FZA_TEST_PM_CSR		0x2A	/* PM CSR */
+#define FZA_TEST_PORT_STATUS	0x2B	/* port status register */
+#define FZA_TEST_HOST_IRQMASK	0x2C	/* host interrupt mask */
+#define FZA_TEST_TIMER_IRQ1	0x2D	/* RTOS timer */
+#define FZA_TEST_FORCE_IRQ1	0x2E	/* force RTOS IRQ1 */
+#define FZA_TEST_TIMER_IRQ5	0x2F	/* IRQ5 backoff timer */
+#define FZA_TEST_FORCE_IRQ5	0x30	/* force IRQ5 */
+#define FZA_TEST_RES_31		0x31	/* reserved */
+#define FZA_TEST_IC_PRIO	0x32	/* interrupt controller priority */
+#define FZA_TEST_PM_FULL	0x33	/* full packet memory */
+#define FZA_TEST_PMI_DMA	0x34	/* PMI DMA */
+
+/* Interrupt mask register constants.  All bits are r/w. */
+#define FZA_MASK_RESERVED	0xf000	/* unused */
+#define FZA_MASK_DLU_DONE	0x0800	/* flash memory write complete */
+#define FZA_MASK_FLUSH_TX	0x0400	/* transmit ring flush request */
+#define FZA_MASK_PM_PARITY_ERR	0x0200	/* onboard packet memory parity error
+					 */
+#define FZA_MASK_HB_PARITY_ERR	0x0100	/* host bus parity error */
+#define FZA_MASK_NXM_ERR	0x0080	/* adapter non-existent memory
+					 * reference
+					 */
+#define FZA_MASK_LINK_ST_CHG	0x0040	/* link status change */
+#define FZA_MASK_STATE_CHG	0x0020	/* adapter state change */
+#define FZA_MASK_UNS_POLL	0x0010	/* unsolicited event service request */
+#define FZA_MASK_CMD_DONE	0x0008	/* command ring entry processed */
+#define FZA_MASK_SMT_TX_POLL	0x0004	/* SMT frame transmit request */
+#define FZA_MASK_RCV_POLL	0x0002	/* receive request (packet available)
+					 */
+#define FZA_MASK_TX_DONE	0x0001	/* RMC transmit done acknowledge */
+
+/* Which interrupts to receive: 0/1 is mask/unmask. */
+#define FZA_MASK_NONE		0x0000
+#define FZA_MASK_NORMAL							\
+		((~(FZA_MASK_RESERVED | FZA_MASK_DLU_DONE |		\
+		    FZA_MASK_PM_PARITY_ERR | FZA_MASK_HB_PARITY_ERR |	\
+		    FZA_MASK_NXM_ERR)) & 0xffff)
+
+/* Control A register constants. */
+#define FZA_CONTROL_A_HB_PARITY_ERR	0x8000	/* host bus parity error */
+#define FZA_CONTROL_A_NXM_ERR		0x4000	/* adapter non-existent memory
+						 * reference
+						 */
+#define FZA_CONTROL_A_SMT_RX_OVFL	0x0040	/* SMT receive overflow */
+#define FZA_CONTROL_A_FLUSH_DONE	0x0020	/* flush tx request complete */
+#define FZA_CONTROL_A_SHUT		0x0010	/* turn the interface off */
+#define FZA_CONTROL_A_HALT		0x0008	/* halt the controller */
+#define FZA_CONTROL_A_CMD_POLL		0x0004	/* command ring poll */
+#define FZA_CONTROL_A_SMT_RX_POLL	0x0002	/* SMT receive ring poll */
+#define FZA_CONTROL_A_TX_POLL		0x0001	/* transmit poll */
+
+/* Control B register constants.  All bits are r/w.
+ *
+ * Possible values:
+ *	0x0000 after booting into REX,
+ *	0x0003 after issuing `boot #/mop'.
+ */
+#define FZA_CONTROL_B_CONSOLE	0x0002	/* OR with DRIVER for console
+					 * (TC firmware) mode
+					 */
+#define FZA_CONTROL_B_DRIVER	0x0001	/* driver mode */
+#define FZA_CONTROL_B_IDLE	0x0000	/* no driver installed */
+
+#define FZA_RESET_PAD							\
+		(FZA_REG_RESET - FZA_REG_BASE)
+#define FZA_INT_EVENT_PAD						\
+		(FZA_REG_INT_EVENT - FZA_REG_RESET - sizeof(u16))
+#define FZA_CONTROL_A_PAD						\
+		(FZA_REG_CONTROL_A - FZA_REG_INT_MASK - sizeof(u16))
+
+/* Layout of registers. */
+struct fza_regs {
+	u8  pad0[FZA_RESET_PAD];
+	u16 reset;				/* reset register */
+	u8  pad1[FZA_INT_EVENT_PAD];
+	u16 int_event;				/* interrupt event register */
+	u16 status;				/* status register */
+	u16 int_mask;				/* interrupt mask register */
+	u8  pad2[FZA_CONTROL_A_PAD];
+	u16 control_a;				/* control A register */
+	u16 control_b;				/* control B register */
+};
+
+/* Command descriptor ring entry. */
+struct fza_ring_cmd {
+	u32 cmd_own;		/* bit 31: ownership, bits [30:0]: command */
+	u32 stat;		/* command status */
+	u32 buffer;		/* address of the buffer in the FZA space */
+	u32 pad0;
+};
+
+#define FZA_RING_CMD		0x200400	/* command ring address */
+#define FZA_RING_CMD_SIZE	0x40		/* command descriptor ring
+						 * size
+/* Command constants. */
+#define FZA_RING_CMD_MASK	0x7fffffff
+#define FZA_RING_CMD_NOP	0x00000000	/* nop */
+#define FZA_RING_CMD_INIT	0x00000001	/* initialize */
+#define FZA_RING_CMD_MODCAM	0x00000002	/* modify CAM */
+#define FZA_RING_CMD_PARAM	0x00000003	/* set system parameters */
+#define FZA_RING_CMD_MODPROM	0x00000004	/* modify promiscuous mode */
+#define FZA_RING_CMD_SETCHAR	0x00000005	/* set link characteristics */
+#define FZA_RING_CMD_RDCNTR	0x00000006	/* read counters */
+#define FZA_RING_CMD_STATUS	0x00000007	/* get link status */
+#define FZA_RING_CMD_RDCAM	0x00000008	/* read CAM */
+
+/* Command status constants. */
+#define FZA_RING_STAT_SUCCESS	0x00000000
+
+/* Unsolicited event descriptor ring entry. */
+struct fza_ring_uns {
+	u32 own;		/* bit 31: ownership, bits [30:0]: reserved */
+	u32 id;			/* event ID */
+	u32 buffer;		/* address of the buffer in the FZA space */
+	u32 pad0;		/* reserved */
+};
+
+#define FZA_RING_UNS		0x200800	/* unsolicited ring address */
+#define FZA_RING_UNS_SIZE	0x40		/* unsolicited descriptor ring
+						 * size
+						 */
+/* Unsolicited event constants. */
+#define FZA_RING_UNS_UND	0x00000000	/* undefined event ID */
+#define FZA_RING_UNS_INIT_IN	0x00000001	/* ring init initiated */
+#define FZA_RING_UNS_INIT_RX	0x00000002	/* ring init received */
+#define FZA_RING_UNS_BEAC_IN	0x00000003	/* ring beaconing initiated */
+#define FZA_RING_UNS_DUP_ADDR	0x00000004	/* duplicate address detected */
+#define FZA_RING_UNS_DUP_TOK	0x00000005	/* duplicate token detected */
+#define FZA_RING_UNS_PURG_ERR	0x00000006	/* ring purger error */
+#define FZA_RING_UNS_STRIP_ERR	0x00000007	/* bridge strip error */
+#define FZA_RING_UNS_OP_OSC	0x00000008	/* ring op oscillation */
+#define FZA_RING_UNS_BEAC_RX	0x00000009	/* directed beacon received */
+#define FZA_RING_UNS_PCT_IN	0x0000000a	/* PC trace initiated */
+#define FZA_RING_UNS_PCT_RX	0x0000000b	/* PC trace received */
+#define FZA_RING_UNS_TX_UNDER	0x0000000c	/* transmit underrun */
+#define FZA_RING_UNS_TX_FAIL	0x0000000d	/* transmit failure */
+#define FZA_RING_UNS_RX_OVER	0x0000000e	/* receive overrun */
+
+/* RMC (Ring Memory Control) transmit descriptor ring entry. */
+struct fza_ring_rmc_tx {
+	u32 rmc;		/* RMC information */
+	u32 avl;		/* available for host (unused by RMC) */
+	u32 own;		/* bit 31: ownership, bits [30:0]: reserved */
+	u32 pad0;		/* reserved */
+};
+
+#define FZA_TX_BUFFER_ADDR(x)	(0x200000 | (((x) & 0xffff) << 5))
+#define FZA_TX_BUFFER_SIZE	512
+struct fza_buffer_tx {
+	u32 data[FZA_TX_BUFFER_SIZE / sizeof(u32)];
+};
+
+/* Transmit ring RMC constants. */
+#define FZA_RING_TX_SOP		0x80000000	/* start of packet */
+#define FZA_RING_TX_EOP		0x40000000	/* end of packet */
+#define FZA_RING_TX_DTP		0x20000000	/* discard this packet */
+#define FZA_RING_TX_VBC		0x10000000	/* valid buffer byte count */
+#define FZA_RING_TX_DCC_MASK	0x0f000000	/* DMA completion code */
+#define FZA_RING_TX_DCC_SUCCESS	0x01000000	/* transmit succeeded */
+#define FZA_RING_TX_DCC_DTP_SOP	0x02000000	/* DTP set at SOP */
+#define FZA_RING_TX_DCC_DTP	0x04000000	/* DTP set within packet */
+#define FZA_RING_TX_DCC_ABORT	0x05000000	/* MAC-requested abort */
+#define FZA_RING_TX_DCC_PARITY	0x06000000	/* xmit data parity error */
+#define FZA_RING_TX_DCC_UNDRRUN	0x07000000	/* transmit underrun */
+#define FZA_RING_TX_XPO_MASK	0x003fe000	/* transmit packet offset */
+
+/* Host receive descriptor ring entry. */
+struct fza_ring_hst_rx {
+	u32 buf0_own;		/* bit 31: ownership, bits [30:23]: unused,
+				 * bits [22:0]: right-shifted address of the
+				 * buffer in system memory (low buffer)
+				 */
+	u32 buffer1;		/* bits [31:23]: unused,
+				 * bits [22:0]: right-shifted address of the
+				 * buffer in system memory (high buffer)
+				 */
+	u32 rmc;		/* RMC information */
+	u32 pad0;
+};
+
+#define FZA_RX_BUFFER_SIZE	(4096 + 512)	/* buffer length */
+
+/* Receive ring RMC constants. */
+#define FZA_RING_RX_SOP		0x80000000	/* start of packet */
+#define FZA_RING_RX_EOP		0x40000000	/* end of packet */
+#define FZA_RING_RX_FSC_MASK	0x38000000	/* # of frame status bits */
+#define FZA_RING_RX_FSB_MASK	0x07c00000	/* frame status bits */
+#define FZA_RING_RX_FSB_ERR	0x04000000	/* error detected */
+#define FZA_RING_RX_FSB_ADDR	0x02000000	/* address recognized */
+#define FZA_RING_RX_FSB_COP	0x01000000	/* frame copied */
+#define FZA_RING_RX_FSB_F0	0x00800000	/* first additional flag */
+#define FZA_RING_RX_FSB_F1	0x00400000	/* second additional flag */
+#define FZA_RING_RX_BAD		0x00200000	/* bad packet */
+#define FZA_RING_RX_CRC		0x00100000	/* CRC error */
+#define FZA_RING_RX_RRR_MASK	0x000e0000	/* MAC receive status bits */
+#define FZA_RING_RX_RRR_OK	0x00000000	/* receive OK */
+#define FZA_RING_RX_RRR_SADDR	0x00020000	/* source address matched */
+#define FZA_RING_RX_RRR_DADDR	0x00040000	/* dest address not matched */
+#define FZA_RING_RX_RRR_ABORT	0x00060000	/* RMC abort */
+#define FZA_RING_RX_RRR_LENGTH	0x00080000	/* invalid length */
+#define FZA_RING_RX_RRR_FRAG	0x000a0000	/* fragment */
+#define FZA_RING_RX_RRR_FORMAT	0x000c0000	/* format error */
+#define FZA_RING_RX_RRR_RESET	0x000e0000	/* MAC reset */
+#define FZA_RING_RX_DA_MASK	0x00018000	/* daddr match status bits */
+#define FZA_RING_RX_DA_NONE	0x00000000	/* no match */
+#define FZA_RING_RX_DA_PROM	0x00008000	/* promiscuous match */
+#define FZA_RING_RX_DA_CAM	0x00010000	/* CAM entry match */
+#define FZA_RING_RX_DA_LOCAL	0x00018000	/* link addr or LLC bcast */
+#define FZA_RING_RX_SA_MASK	0x00006000	/* saddr match status bits */
+#define FZA_RING_RX_SA_NONE	0x00000000	/* no match */
+#define FZA_RING_RX_SA_ALIAS	0x00002000	/* alias address match */
+#define FZA_RING_RX_SA_CAM	0x00004000	/* CAM entry match */
+#define FZA_RING_RX_SA_LOCAL	0x00006000	/* link address match */
+
+/* SMT (Station Management) transmit/receive descriptor ring entry. */
+struct fza_ring_smt {
+	u32 own;		/* bit 31: ownership, bits [30:0]: unused */
+	u32 rmc;		/* RMC information */
+	u32 buffer;		/* address of the buffer */
+	u32 pad0;		/* reserved */
+};
+
+/* Ownership constants.
+ *
+ * Only an owner is permitted to process a given ring entry.
+ * RMC transmit ring meanings are reversed.
+ */
+#define FZA_RING_OWN_MASK	0x80000000
+#define FZA_RING_OWN_FZA	0x00000000	/* permit FZA, forbid host */
+#define FZA_RING_OWN_HOST	0x80000000	/* permit host, forbid FZA */
+#define FZA_RING_TX_OWN_RMC	0x80000000	/* permit RMC, forbid host */
+#define FZA_RING_TX_OWN_HOST	0x00000000	/* permit host, forbid RMC */
+
+/* RMC constants. */
+#define FZA_RING_PBC_MASK	0x00001fff	/* frame length */
+
+/* Layout of counter buffers. */
+
+struct fza_counter {
+	u32 msw;
+	u32 lsw;
+};
+
+struct fza_counters {
+	struct fza_counter sys_buf;	/* system buffer unavailable */
+	struct fza_counter tx_under;	/* transmit underruns */
+	struct fza_counter tx_fail;	/* transmit failures */
+	struct fza_counter rx_over;	/* receive data overruns */
+	struct fza_counter frame_cnt;	/* frame count */
+	struct fza_counter error_cnt;	/* error count */
+	struct fza_counter lost_cnt;	/* lost count */
+	struct fza_counter rinit_in;	/* ring initialization initiated */
+	struct fza_counter rinit_rx;	/* ring initialization received */
+	struct fza_counter beac_in;	/* ring beacon initiated */
+	struct fza_counter dup_addr;	/* duplicate address test failures */
+	struct fza_counter dup_tok;	/* duplicate token detected */
+	struct fza_counter purg_err;	/* ring purge errors */
+	struct fza_counter strip_err;	/* bridge strip errors */
+	struct fza_counter pct_in;	/* traces initiated */
+	struct fza_counter pct_rx;	/* traces received */
+	struct fza_counter lem_rej;	/* LEM rejects */
+	struct fza_counter tne_rej;	/* TNE expiry rejects */
+	struct fza_counter lem_event;	/* LEM events */
+	struct fza_counter lct_rej;	/* LCT rejects */
+	struct fza_counter conn_cmpl;	/* connections completed */
+	struct fza_counter el_buf;	/* elasticity buffer errors */
+};
+
+/* Layout of command buffers. */
+
+/* INIT command buffer.
+ *
+ * Values of default link parameters given are as obtained from a
+ * DEFZA-AA rev. C03 board.  The board counts time in units of 80ns.
+ */
+struct fza_cmd_init {
+	u32 tx_mode;			/* transmit mode */
+	u32 hst_rx_size;		/* host receive ring entries */
+
+	struct fza_counters counters;	/* counters */
+
+	u8 rmc_rev[4];			/* RMC revision */
+	u8 rom_rev[4];			/* ROM revision */
+	u8 fw_rev[4];			/* firmware revision */
+
+	u32 mop_type;			/* MOP device type */
+
+	u32 hst_rx;			/* base of host rx descriptor ring */
+	u32 rmc_tx;			/* base of RMC tx descriptor ring */
+	u32 rmc_tx_size;		/* size of RMC tx descriptor ring */
+	u32 smt_tx;			/* base of SMT tx descriptor ring */
+	u32 smt_tx_size;		/* size of SMT tx descriptor ring */
+	u32 smt_rx;			/* base of SMT rx descriptor ring */
+	u32 smt_rx_size;		/* size of SMT rx descriptor ring */
+
+	u32 hw_addr[2];			/* link address */
+
+	u32 def_t_req;			/* default Requested TTRT (T_REQ) --
+					 * C03: 100000 [80ns]
+					 */
+	u32 def_tvx;			/* default Valid Transmission Time
+					 * (TVX) -- C03: 32768 [80ns]
+					 */
+	u32 def_t_max;			/* default Maximum TTRT (T_MAX) --
+					 * C03: 2162688 [80ns]
+					 */
+	u32 lem_threshold;		/* default LEM threshold -- C03: 8 */
+	u32 def_station_id[2];		/* default station ID */
+
+	u32 pmd_type_alt;		/* alternative PMD type code */
+
+	u32 smt_ver;			/* SMT version */
+
+	u32 rtoken_timeout;		/* default restricted token timeout
+					 * -- C03: 12500000 [80ns]
+					 */
+	u32 ring_purger;		/* default ring purger enable --
+					 * C03: 1
+					 */
+
+	u32 smt_ver_max;		/* max SMT version ID */
+	u32 smt_ver_min;		/* min SMT version ID */
+	u32 pmd_type;			/* PMD type code */
+};
+
+/* INIT command PMD type codes. */
+#define FZA_PMD_TYPE_MMF	  0	/* Multimode fiber */
+#define FZA_PMD_TYPE_TW		101	/* ThinWire */
+#define FZA_PMD_TYPE_STP	102	/* STP */
+
+/* MODCAM/RDCAM command buffer. */
+#define FZA_CMD_CAM_SIZE	64		/* CAM address entry count */
+struct fza_cmd_cam {
+	u32 hw_addr[FZA_CMD_CAM_SIZE][2];	/* CAM address entries */
+};
+
+/* PARAM command buffer.
+ *
+ * Permitted ranges given are as defined by the spec and obtained from a
+ * DEFZA-AA rev. C03 board, respectively.  The rtoken_timeout field is
+ * erroneously interpreted in units of ms.
+ */
+struct fza_cmd_param {
+	u32 loop_mode;			/* loopback mode */
+	u32 t_max;			/* Maximum TTRT (T_MAX)
+					 * def: ??? [80ns]
+					 * C03: [t_req+1,4294967295] [80ns]
+					 */
+	u32 t_req;			/* Requested TTRT (T_REQ)
+					 * def: [50000,2097151] [80ns]
+					 * C03: [50001,t_max-1] [80ns]
+					 */
+	u32 tvx;			/* Valid Transmission Time (TVX)
+					 * def: [29375,65280] [80ns]
+					 * C03: [29376,65279] [80ns]
+					 */
+	u32 lem_threshold;		/* LEM threshold */
+	u32 station_id[2];		/* station ID */
+	u32 rtoken_timeout;		/* restricted token timeout
+					 * def: [0,125000000] [80ns]
+					 * C03: [0,9999] [ms]
+					 */
+	u32 ring_purger;		/* ring purger enable: 0|1 */
+};
+
+/* Loopback modes for the PARAM command. */
+#define FZA_LOOP_NORMAL		0
+#define FZA_LOOP_INTERN		1
+#define FZA_LOOP_EXTERN		2
+
+/* MODPROM command buffer. */
+struct fza_cmd_modprom {
+	u32 llc_prom;			/* LLC promiscuous enable */
+	u32 smt_prom;			/* SMT promiscuous enable */
+	u32 llc_multi;			/* LLC multicast promiscuous enable */
+	u32 llc_bcast;			/* LLC broadcast promiscuous enable */
+};
+
+/* SETCHAR command buffer.
+ *
+ * Permitted ranges are as for the PARAM command.
+ */
+struct fza_cmd_setchar {
+	u32 t_max;			/* Maximum TTRT (T_MAX) */
+	u32 t_req;			/* Requested TTRT (T_REQ) */
+	u32 tvx;			/* Valid Transmission Time (TVX) */
+	u32 lem_threshold;		/* LEM threshold */
+	u32 rtoken_timeout;		/* restricted token timeout */
+	u32 ring_purger;		/* ring purger enable */
+};
+
+/* RDCNTR command buffer. */
+struct fza_cmd_rdcntr {
+	struct fza_counters counters;	/* counters */
+};
+
+/* STATUS command buffer. */
+struct fza_cmd_status {
+	u32 led_state;			/* LED state */
+	u32 rmt_state;			/* ring management state */
+	u32 link_state;			/* link state */
+	u32 dup_addr;			/* duplicate address flag */
+	u32 ring_purger;		/* ring purger state */
+	u32 t_neg;			/* negotiated TTRT [80ns] */
+	u32 una[2];			/* upstream neighbour address */
+	u32 una_timeout;		/* UNA timed out */
+	u32 strip_mode;			/* frame strip mode */
+	u32 yield_mode;			/* claim token yield mode */
+	u32 phy_state;			/* PHY state */
+	u32 neigh_phy;			/* neighbour PHY type */
+	u32 reject;			/* reject reason */
+	u32 phy_lee;			/* PHY link error estimate [-log10] */
+	u32 una_old[2];			/* old upstream neighbour address */
+	u32 rmt_mac;			/* remote MAC indicated */
+	u32 ring_err;			/* ring error reason */
+	u32 beac_rx[2];			/* sender of last directed beacon */
+	u32 un_dup_addr;		/* upstream neighbr dup address flag */
+	u32 dna[2];			/* downstream neighbour address */
+	u32 dna_old[2];			/* old downstream neighbour address */
+};
+
+/* Common command buffer. */
+union fza_cmd_buf {
+	struct fza_cmd_init init;
+	struct fza_cmd_cam cam;
+	struct fza_cmd_param param;
+	struct fza_cmd_modprom modprom;
+	struct fza_cmd_setchar setchar;
+	struct fza_cmd_rdcntr rdcntr;
+	struct fza_cmd_status status;
+};
+
+/* MAC (Media Access Controller) chip packet request header constants. */
+
+/* Packet request header byte #0. */
+#define FZA_PRH0_FMT_TYPE_MASK	0xc0	/* type of packet, always zero */
+#define FZA_PRH0_TOK_TYPE_MASK	0x30	/* type of token required
+					 * to send this frame
+					 */
+#define FZA_PRH0_TKN_TYPE_ANY	0x30	/* use either token type */
+#define FZA_PRH0_TKN_TYPE_UNR	0x20	/* use an unrestricted token */
+#define FZA_PRH0_TKN_TYPE_RST	0x10	/* use a restricted token */
+#define FZA_PRH0_TKN_TYPE_IMM	0x00	/* send immediately, no token required
+					 */
+#define FZA_PRH0_FRAME_MASK	0x08	/* type of frame to send */
+#define FZA_PRH0_FRAME_SYNC	0x08	/* send a synchronous frame */
+#define FZA_PRH0_FRAME_ASYNC	0x00	/* send an asynchronous frame */
+#define FZA_PRH0_MODE_MASK	0x04	/* send mode */
+#define FZA_PRH0_MODE_IMMED	0x04	/* an immediate mode, send regardless
+					 * of the ring operational state
+					 */
+#define FZA_PRH0_MODE_NORMAL	0x00	/* a normal mode, send only if ring
+					 * operational
+					 */
+#define FZA_PRH0_SF_MASK	0x02	/* send frame first */
+#define FZA_PRH0_SF_FIRST	0x02	/* send this frame first
+					 * with this token capture
+					 */
+#define FZA_PRH0_SF_NORMAL	0x00	/* treat this frame normally */
+#define FZA_PRH0_BCN_MASK	0x01	/* beacon frame */
+#define FZA_PRH0_BCN_BEACON	0x01	/* send the frame only
+					 * if in the beacon state
+					 */
+#define FZA_PRH0_BCN_DATA	0x01	/* send the frame only
+					 * if in the data state
+					 */
+/* Packet request header byte #1. */
+					/* bit 7 always zero */
+#define FZA_PRH1_SL_MASK	0x40	/* send frame last */
+#define FZA_PRH1_SL_LAST	0x40	/* send this frame last, releasing
+					 * the token afterwards
+					 */
+#define FZA_PRH1_SL_NORMAL	0x00	/* treat this frame normally */
+#define FZA_PRH1_CRC_MASK	0x20	/* CRC append */
+#define FZA_PRH1_CRC_NORMAL	0x20	/* calculate the CRC and append it
+					 * as the FCS field to the frame
+					 */
+#define FZA_PRH1_CRC_SKIP	0x00	/* leave the frame as is */
+#define FZA_PRH1_TKN_SEND_MASK	0x18	/* type of token to send after the
+					 * frame if this is the last frame
+					 */
+#define FZA_PRH1_TKN_SEND_ORIG	0x18	/* send a token of the same type as the
+					 * originally captured one
+					 */
+#define FZA_PRH1_TKN_SEND_RST	0x10	/* send a restricted token */
+#define FZA_PRH1_TKN_SEND_UNR	0x08	/* send an unrestricted token */
+#define FZA_PRH1_TKN_SEND_NONE	0x00	/* send no token */
+#define FZA_PRH1_EXTRA_FS_MASK	0x07	/* send extra frame status indicators
+					 */
+#define FZA_PRH1_EXTRA_FS_ST	0x07	/* TR RR ST II */
+#define FZA_PRH1_EXTRA_FS_SS	0x06	/* TR RR SS II */
+#define FZA_PRH1_EXTRA_FS_SR	0x05	/* TR RR SR II */
+#define FZA_PRH1_EXTRA_FS_NONE1	0x04	/* TR RR II II */
+#define FZA_PRH1_EXTRA_FS_RT	0x03	/* TR RR RT II */
+#define FZA_PRH1_EXTRA_FS_RS	0x02	/* TR RR RS II */
+#define FZA_PRH1_EXTRA_FS_RR	0x01	/* TR RR RR II */
+#define FZA_PRH1_EXTRA_FS_NONE	0x00	/* TR RR II II */
+/* Packet request header byte #2. */
+#define FZA_PRH2_NORMAL		0x00	/* always zero */
+
+/* PRH used for LLC frames. */
+#define FZA_PRH0_LLC		(FZA_PRH0_TKN_TYPE_UNR)
+#define FZA_PRH1_LLC		(FZA_PRH1_CRC_NORMAL | FZA_PRH1_TKN_SEND_UNR)
+#define FZA_PRH2_LLC		(FZA_PRH2_NORMAL)
+
+/* PRH used for SMT frames. */
+#define FZA_PRH0_SMT		(FZA_PRH0_TKN_TYPE_UNR)
+#define FZA_PRH1_SMT		(FZA_PRH1_CRC_NORMAL | FZA_PRH1_TKN_SEND_UNR)
+#define FZA_PRH2_SMT		(FZA_PRH2_NORMAL)
+
+#if ((FZA_RING_RX_SIZE) < 2) || ((FZA_RING_RX_SIZE) > 256)
+# error FZA_RING_RX_SIZE has to be from 2 up to 256
+#endif
+#if ((FZA_RING_TX_MODE) != 0) && ((FZA_RING_TX_MODE) != 1)
+# error FZA_RING_TX_MODE has to be either 0 or 1
+#endif
+
+#define FZA_RING_TX_SIZE (512 << (FZA_RING_TX_MODE))
+
+struct fza_private {
+	struct device *bdev;		/* pointer to the bus device */
+	const char *name;		/* printable device name */
+	void __iomem *mmio;		/* MMIO ioremap cookie */
+	struct fza_regs __iomem *regs;	/* pointer to FZA registers */
+
+	struct sk_buff *rx_skbuff[FZA_RING_RX_SIZE];
+					/* all skbs assigned to the host
+					 * receive descriptors
+					 */
+	dma_addr_t rx_dma[FZA_RING_RX_SIZE];
+					/* their corresponding DMA addresses */
+
+	struct fza_ring_cmd __iomem *ring_cmd;
+					/* pointer to the command descriptor
+					 * ring
+					 */
+	int ring_cmd_index;		/* index to the command descriptor ring
+					 * for the next command
+					 */
+	struct fza_ring_uns __iomem *ring_uns;
+					/* pointer to the unsolicited
+					 * descriptor ring
+					 */
+	int ring_uns_index;		/* index to the unsolicited descriptor
+					 * ring for the next event
+					 */
+
+	struct fza_ring_rmc_tx __iomem *ring_rmc_tx;
+					/* pointer to the RMC transmit
+					 * descriptor ring (obtained from the
+					 * INIT command)
+					 */
+	int ring_rmc_tx_size;		/* number of entries in the RMC
+					 * transmit descriptor ring (obtained
+					 * from the INIT command)
+					 */
+	int ring_rmc_tx_index;		/* index to the RMC transmit descriptor
+					 * ring for the next transmission
+					 */
+	int ring_rmc_txd_index;		/* index to the RMC transmit descriptor
+					 * ring for the next transmit done
+					 * acknowledge
+					 */
+
+	struct fza_ring_hst_rx __iomem *ring_hst_rx;
+					/* pointer to the host receive
+					 * descriptor ring (obtained from the
+					 * INIT command)
+					 */
+	int ring_hst_rx_size;		/* number of entries in the host
+					 * receive descriptor ring (set by the
+					 * INIT command)
+					 */
+	int ring_hst_rx_index;		/* index to the host receive descriptor
+					 * ring for the next transmission
+					 */
+
+	struct fza_ring_smt __iomem *ring_smt_tx;
+					/* pointer to the SMT transmit
+					 * descriptor ring (obtained from the
+					 * INIT command)
+					 */
+	int ring_smt_tx_size;		/* number of entries in the SMT
+					 * transmit descriptor ring (obtained
+					 * from the INIT command)
+					 */
+	int ring_smt_tx_index;		/* index to the SMT transmit descriptor
+					 * ring for the next transmission
+					 */
+
+	struct fza_ring_smt __iomem *ring_smt_rx;
+					/* pointer to the SMT transmit
+					 * descriptor ring (obtained from the
+					 * INIT command)
+					 */
+	int ring_smt_rx_size;		/* number of entries in the SMT
+					 * receive descriptor ring (obtained
+					 * from the INIT command)
+					 */
+	int ring_smt_rx_index;		/* index to the SMT receive descriptor
+					 * ring for the next transmission
+					 */
+
+	struct fza_buffer_tx __iomem *buffer_tx;
+					/* pointer to the RMC transmit buffers
+					 */
+
+	uint state;			/* adapter expected state */
+
+	spinlock_t lock;		/* for device & private data access */
+	uint int_mask;			/* interrupt source selector */
+
+	int cmd_done_flag;		/* command completion trigger */
+	wait_queue_head_t cmd_done_wait;
+
+	int state_chg_flag;		/* state change trigger */
+	wait_queue_head_t state_chg_wait;
+
+	struct timer_list reset_timer;	/* RESET time-out trigger */
+	int timer_state;		/* RESET trigger state */
+
+	int queue_active;		/* whether to enable queueing */
+
+	struct net_device_stats stats;
+
+	uint irq_count_flush_tx;	/* transmit flush irqs */
+	uint irq_count_uns_poll;	/* unsolicited event irqs */
+	uint irq_count_smt_tx_poll;	/* SMT transmit irqs */
+	uint irq_count_rx_poll;		/* host receive irqs */
+	uint irq_count_tx_done;		/* transmit done irqs */
+	uint irq_count_cmd_done;	/* command done irqs */
+	uint irq_count_state_chg;	/* state change irqs */
+	uint irq_count_link_st_chg;	/* link status change irqs */
+
+	uint t_max;			/* T_MAX */
+	uint t_req;			/* T_REQ */
+	uint tvx;			/* TVX */
+	uint lem_threshold;		/* LEM threshold */
+	uint station_id[2];		/* station ID */
+	uint rtoken_timeout;		/* restricted token timeout */
+	uint ring_purger;		/* ring purger enable flag */
+};
+
+struct fza_fddihdr {
+	u8 pa[2];			/* preamble */
+	u8 sd;				/* starting delimiter */
+	struct fddihdr hdr;
+} __packed;
diff --git a/include/uapi/linux/if_fddi.h b/include/uapi/linux/if_fddi.h
index 75eed8b62823..7239aa9c0766 100644
--- a/include/uapi/linux/if_fddi.h
+++ b/include/uapi/linux/if_fddi.h
@@ -6,9 +6,10 @@
  *
  *		Global definitions for the ANSI FDDI interface.
  *
- * Version:	@(#)if_fddi.h	1.0.2	Sep 29 2004
+ * Version:	@(#)if_fddi.h	1.0.3	Oct  6 2018
  *
- * Author:	Lawrence V. Stefani, <stefani@lkg.dec.com>
+ * Author:	Lawrence V. Stefani, <stefani@yahoo.com>
+ * Maintainer:	Maciej W. Rozycki, <macro@linux-mips.org>
  *
  *		if_fddi.h is based on previous if_ether.h and if_tr.h work by
  *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -45,7 +46,21 @@
 #define FDDI_K_OUI_LEN		3	/* Octets in OUI in 802.2 SNAP
 					   header */
 
-/* Define FDDI Frame Control (FC) Byte values */
+/* Define FDDI Frame Control (FC) Byte masks */
+#define FDDI_FC_K_CLASS_MASK		0x80	/* class bit */
+#define FDDI_FC_K_CLASS_SYNC		0x80
+#define FDDI_FC_K_CLASS_ASYNC		0x00
+#define FDDI_FC_K_ALEN_MASK		0x40	/* address length bit */
+#define FDDI_FC_K_ALEN_48		0x40
+#define FDDI_FC_K_ALEN_16		0x00
+#define FDDI_FC_K_FORMAT_MASK		0x30	/* format bits */
+#define FDDI_FC_K_FORMAT_FUTURE		0x30
+#define FDDI_FC_K_FORMAT_IMPLEMENTOR	0x20
+#define FDDI_FC_K_FORMAT_LLC		0x10
+#define FDDI_FC_K_FORMAT_MANAGEMENT	0x00
+#define FDDI_FC_K_CONTROL_MASK		0x0f	/* control bits */
+
+/* Define FDDI Frame Control (FC) Byte specific values */
 #define FDDI_FC_K_VOID			0x00
 #define FDDI_FC_K_NON_RESTRICTED_TOKEN	0x80
 #define FDDI_FC_K_RESTRICTED_TOKEN	0xC0
-- 
cgit v1.2.3


From 9f9a742db40f95f4dc20fc7293de4ea6ddb24e47 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 9 Oct 2018 23:57:49 +0100
Subject: FDDI: defza: Support capturing outgoing SMT traffic

DEC FDDIcontroller 700 (DEFZA) uses a Tx/Rx queue pair to communicate
SMT frames with adapter's firmware.  Any SMT frame received from the RMC
via the Rx queue is queued back by the driver to the SMT Rx queue for
the firmware to process.  Similarly the firmware uses the SMT Tx queue
to supply the driver with SMT frames which are queued back to the Tx
queue for the RMC to send to the ring.

When a network tap is attached to an FDDI interface handled by `defza'
any incoming SMT frames captured are queued to our usual processing of
network data received, which in turn delivers them to any listening
taps.

However the outgoing SMT frames produced by the firmware bypass our
network protocol stack and are therefore not delivered to taps.  This in
turn means that taps are missing a part of network traffic sent by the
adapter, which may make it more difficult to track down network problems
or do general traffic analysis.

Call `dev_queue_xmit_nit' then in the SMT Tx path, having checked that
a network tap is attached, with a newly-created `dev_nit_active' helper
wrapping the usual condition used in the transmit path.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/fddi/defza.c  | 33 +++++++++++++++++++++++++++++++--
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 13 ++++++++++++-
 3 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/fddi/defza.c b/drivers/net/fddi/defza.c
index 7d01b70f7ed8..3b7f10a5f06a 100644
--- a/drivers/net/fddi/defza.c
+++ b/drivers/net/fddi/defza.c
@@ -797,11 +797,40 @@ static void fza_tx_smt(struct net_device *dev)
 		smt_tx_ptr = fp->mmio + readl_u(&fp->ring_smt_tx[i].buffer);
 		len = readl_u(&fp->ring_smt_tx[i].rmc) & FZA_RING_PBC_MASK;
 
-		/* Queue the frame to the RMC transmit ring. */
-		if (!netif_queue_stopped(dev))
+		if (!netif_queue_stopped(dev)) {
+			if (dev_nit_active(dev)) {
+				struct sk_buff *skb;
+
+				/* Length must be a multiple of 4 as only word
+				 * reads are permitted!
+				 */
+				skb = fza_alloc_skb_irq(dev, (len + 3) & ~3);
+				if (!skb)
+					goto err_no_skb;	/* Drop. */
+
+				skb_data_ptr = (struct fza_buffer_tx *)
+					       skb->data;
+
+				fza_reads(smt_tx_ptr, skb_data_ptr,
+					  (len + 3) & ~3);
+				skb->dev = dev;
+				skb_reserve(skb, 3);	/* Skip over PRH. */
+				skb_put(skb, len - 3);
+				skb_reset_network_header(skb);
+
+				dev_queue_xmit_nit(skb, dev);
+
+				dev_kfree_skb_irq(skb);
+
+err_no_skb:
+				;
+			}
+
+			/* Queue the frame to the RMC transmit ring. */
 			fza_do_xmit((union fza_buffer_txp)
 				    { .mmio_ptr = smt_tx_ptr },
 				    len, dev, 1);
+		}
 
 		writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_tx[i].own);
 		fp->ring_smt_tx_index =
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 22e4ef7bb701..dc1d9ed33b31 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3645,6 +3645,7 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev,
 	return 0;
 }
 
+bool dev_nit_active(struct net_device *dev);
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
 
 extern int		netdev_budget;
diff --git a/net/core/dev.c b/net/core/dev.c
index a4d39b87b4e5..8497feea8fb5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1976,6 +1976,17 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 	return false;
 }
 
+/**
+ * dev_nit_active - return true if any network interface taps are in use
+ *
+ * @dev: network device to check for the presence of taps
+ */
+bool dev_nit_active(struct net_device *dev)
+{
+	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+}
+EXPORT_SYMBOL_GPL(dev_nit_active);
+
 /*
  *	Support routine. Sends outgoing frames to any network
  *	taps currently in use.
@@ -3233,7 +3244,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 	unsigned int len;
 	int rc;
 
-	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+	if (dev_nit_active(dev))
 		dev_queue_xmit_nit(skb, dev);
 
 	len = skb->len;
-- 
cgit v1.2.3


From 9771b8ccdfa6dcb1ac5128ca7fe8649f3092d392 Mon Sep 17 00:00:00 2001
From: "Justin.Lee1@Dell.com" <Justin.Lee1@Dell.com>
Date: Thu, 11 Oct 2018 18:07:37 +0000
Subject: net/ncsi: Extend NC-SI Netlink interface to allow user space to send
 NC-SI command

The new command (NCSI_CMD_SEND_CMD) is added to allow user space application
to send NC-SI command to the network card.
Also, add a new attribute (NCSI_ATTR_DATA) for transferring request and response.

The work flow is as below.

Request:
User space application
	-> Netlink interface (msg)
	-> new Netlink handler - ncsi_send_cmd_nl()
	-> ncsi_xmit_cmd()

Response:
Response received - ncsi_rcv_rsp()
	-> internal response handler - ncsi_rsp_handler_xxx()
	-> ncsi_rsp_handler_netlink()
	-> ncsi_send_netlink_rsp ()
	-> Netlink interface (msg)
	-> user space application

Command timeout - ncsi_request_timeout()
	-> ncsi_send_netlink_timeout ()
	-> Netlink interface (msg with zero data length)
	-> user space application

Error:
Error detected
	-> ncsi_send_netlink_err ()
	-> Netlink interface (err msg)
	-> user space application

Signed-off-by: Justin Lee <justin.lee1@dell.com>
Reviewed-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ncsi.h |   6 ++
 net/ncsi/internal.h       |   7 ++
 net/ncsi/ncsi-cmd.c       |   8 ++
 net/ncsi/ncsi-manage.c    |  16 ++++
 net/ncsi/ncsi-netlink.c   | 204 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ncsi/ncsi-netlink.h   |  12 +++
 net/ncsi/ncsi-rsp.c       |  67 +++++++++++++--
 7 files changed, 315 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
index 4c292ecbb748..0a26a5576645 100644
--- a/include/uapi/linux/ncsi.h
+++ b/include/uapi/linux/ncsi.h
@@ -23,6 +23,9 @@
  *	optionally the preferred NCSI_ATTR_CHANNEL_ID.
  * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination.
  *	Requires NCSI_ATTR_IFINDEX.
+ * @NCSI_CMD_SEND_CMD: send NC-SI command to network card.
+ *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID
+ *	and NCSI_ATTR_CHANNEL_ID.
  * @NCSI_CMD_MAX: highest command number
  */
 enum ncsi_nl_commands {
@@ -30,6 +33,7 @@ enum ncsi_nl_commands {
 	NCSI_CMD_PKG_INFO,
 	NCSI_CMD_SET_INTERFACE,
 	NCSI_CMD_CLEAR_INTERFACE,
+	NCSI_CMD_SEND_CMD,
 
 	__NCSI_CMD_AFTER_LAST,
 	NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
@@ -43,6 +47,7 @@ enum ncsi_nl_commands {
  * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
  * @NCSI_ATTR_PACKAGE_ID: package ID
  * @NCSI_ATTR_CHANNEL_ID: channel ID
+ * @NCSI_ATTR_DATA: command payload
  * @NCSI_ATTR_MAX: highest attribute number
  */
 enum ncsi_nl_attrs {
@@ -51,6 +56,7 @@ enum ncsi_nl_attrs {
 	NCSI_ATTR_PACKAGE_LIST,
 	NCSI_ATTR_PACKAGE_ID,
 	NCSI_ATTR_CHANNEL_ID,
+	NCSI_ATTR_DATA,
 
 	__NCSI_ATTR_AFTER_LAST,
 	NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 3d0a33b874f5..13c9b5eeb3b7 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -175,6 +175,8 @@ struct ncsi_package;
 #define NCSI_RESERVED_CHANNEL	0x1f
 #define NCSI_CHANNEL_INDEX(c)	((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
 #define NCSI_TO_CHANNEL(p, c)	(((p) << NCSI_PACKAGE_SHIFT) | (c))
+#define NCSI_MAX_PACKAGE	8
+#define NCSI_MAX_CHANNEL	32
 
 struct ncsi_channel {
 	unsigned char               id;
@@ -220,11 +222,15 @@ struct ncsi_request {
 	bool                 used;    /* Request that has been assigned  */
 	unsigned int         flags;   /* NCSI request property           */
 #define NCSI_REQ_FLAG_EVENT_DRIVEN	1
+#define NCSI_REQ_FLAG_NETLINK_DRIVEN	2
 	struct ncsi_dev_priv *ndp;    /* Associated NCSI device          */
 	struct sk_buff       *cmd;    /* Associated NCSI command packet  */
 	struct sk_buff       *rsp;    /* Associated NCSI response packet */
 	struct timer_list    timer;   /* Timer on waiting for response   */
 	bool                 enabled; /* Time has been enabled or not    */
+	u32                  snd_seq;     /* netlink sending sequence number */
+	u32                  snd_portid;  /* netlink portid of sender        */
+	struct nlmsghdr      nlhdr;       /* netlink message header          */
 };
 
 enum {
@@ -310,6 +316,7 @@ struct ncsi_cmd_arg {
 		unsigned int   dwords[4];
 	};
 	unsigned char        *data;       /* NCSI OEM data                 */
+	struct genl_info     *info;       /* Netlink information           */
 };
 
 extern struct list_head ncsi_dev_list;
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 82b7d9201db8..356af474e43c 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -17,6 +17,7 @@
 #include <net/ncsi.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/genetlink.h>
 
 #include "internal.h"
 #include "ncsi-pkt.h"
@@ -346,6 +347,13 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
 	if (!nr)
 		return -ENOMEM;
 
+	/* track netlink information */
+	if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+		nr->snd_seq = nca->info->snd_seq;
+		nr->snd_portid = nca->info->snd_portid;
+		nr->nlhdr = *nca->info->nlhdr;
+	}
+
 	/* Prepare the packet */
 	nca->id = nr->id;
 	ret = nch->handler(nr->cmd, nca);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 091284760d21..6aa0614d2d28 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -19,6 +19,7 @@
 #include <net/addrconf.h>
 #include <net/ipv6.h>
 #include <net/if_inet6.h>
+#include <net/genetlink.h>
 
 #include "internal.h"
 #include "ncsi-pkt.h"
@@ -406,6 +407,9 @@ static void ncsi_request_timeout(struct timer_list *t)
 {
 	struct ncsi_request *nr = from_timer(nr, t, timer);
 	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct ncsi_cmd_pkt *cmd;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
 	unsigned long flags;
 
 	/* If the request already had associated response,
@@ -419,6 +423,18 @@ static void ncsi_request_timeout(struct timer_list *t)
 	}
 	spin_unlock_irqrestore(&ndp->lock, flags);
 
+	if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+		if (nr->cmd) {
+			/* Find the package */
+			cmd = (struct ncsi_cmd_pkt *)
+			      skb_network_header(nr->cmd);
+			ncsi_find_package_and_channel(ndp,
+						      cmd->cmd.common.channel,
+						      &np, &nc);
+			ncsi_send_netlink_timeout(nr, np, nc);
+		}
+	}
+
 	/* Release the request */
 	ncsi_free_request(nr);
 }
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 32cb7751d216..33314381b4f5 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -19,6 +19,7 @@
 #include <uapi/linux/ncsi.h>
 
 #include "internal.h"
+#include "ncsi-pkt.h"
 #include "ncsi-netlink.h"
 
 static struct genl_family ncsi_genl_family;
@@ -28,6 +29,7 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
 	[NCSI_ATTR_PACKAGE_LIST] =	{ .type = NLA_NESTED },
 	[NCSI_ATTR_PACKAGE_ID] =	{ .type = NLA_U32 },
 	[NCSI_ATTR_CHANNEL_ID] =	{ .type = NLA_U32 },
+	[NCSI_ATTR_DATA] =		{ .type = NLA_BINARY, .len = 2048 },
 };
 
 static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
@@ -365,6 +367,202 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
 	return 0;
 }
 
+static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info)
+{
+	struct ncsi_dev_priv *ndp;
+	struct ncsi_pkt_hdr *hdr;
+	struct ncsi_cmd_arg nca;
+	unsigned char *data;
+	u32 package_id;
+	u32 channel_id;
+	int len, ret;
+
+	if (!info || !info->attrs) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_DATA]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+	channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
+
+	if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) {
+		ret = -ERANGE;
+		goto out_netlink;
+	}
+
+	len = nla_len(info->attrs[NCSI_ATTR_DATA]);
+	if (len < sizeof(struct ncsi_pkt_hdr)) {
+		netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n",
+			    package_id);
+		ret = -EINVAL;
+		goto out_netlink;
+	} else {
+		data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]);
+	}
+
+	hdr = (struct ncsi_pkt_hdr *)data;
+
+	nca.ndp = ndp;
+	nca.package = (unsigned char)package_id;
+	nca.channel = (unsigned char)channel_id;
+	nca.type = hdr->type;
+	nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN;
+	nca.info = info;
+	nca.payload = ntohs(hdr->length);
+	nca.data = data + sizeof(*hdr);
+
+	ret = ncsi_xmit_cmd(&nca);
+out_netlink:
+	if (ret != 0) {
+		netdev_err(ndp->ndev.dev,
+			   "NCSI: Error %d sending command\n",
+			   ret);
+		ncsi_send_netlink_err(ndp->ndev.dev,
+				      info->snd_seq,
+				      info->snd_portid,
+				      info->nlhdr,
+				      ret);
+	}
+out:
+	return ret;
+}
+
+int ncsi_send_netlink_rsp(struct ncsi_request *nr,
+			  struct ncsi_package *np,
+			  struct ncsi_channel *nc)
+{
+	struct sk_buff *skb;
+	struct net *net;
+	void *hdr;
+	int rc;
+
+	net = dev_net(nr->rsp->dev);
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
+			  &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
+	if (!hdr) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex);
+	if (np)
+		nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
+	if (nc)
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
+	else
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
+
+	rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data);
+	if (rc)
+		goto err;
+
+	genlmsg_end(skb, hdr);
+	return genlmsg_unicast(net, skb, nr->snd_portid);
+
+err:
+	kfree_skb(skb);
+	return rc;
+}
+
+int ncsi_send_netlink_timeout(struct ncsi_request *nr,
+			      struct ncsi_package *np,
+			      struct ncsi_channel *nc)
+{
+	struct sk_buff *skb;
+	struct net *net;
+	void *hdr;
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
+			  &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
+	if (!hdr) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	net = dev_net(nr->cmd->dev);
+
+	nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex);
+
+	if (np)
+		nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
+	else
+		nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID,
+			    NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *)
+						 nr->cmd->data)->channel)));
+
+	if (nc)
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
+	else
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
+
+	genlmsg_end(skb, hdr);
+	return genlmsg_unicast(net, skb, nr->snd_portid);
+}
+
+int ncsi_send_netlink_err(struct net_device *dev,
+			  u32 snd_seq,
+			  u32 snd_portid,
+			  struct nlmsghdr *nlhdr,
+			  int err)
+{
+	struct nlmsghdr *nlh;
+	struct nlmsgerr *nle;
+	struct sk_buff *skb;
+	struct net *net;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	net = dev_net(dev);
+
+	nlh = nlmsg_put(skb, snd_portid, snd_seq,
+			NLMSG_ERROR, sizeof(*nle), 0);
+	nle = (struct nlmsgerr *)nlmsg_data(nlh);
+	nle->error = err;
+	memcpy(&nle->msg, nlhdr, sizeof(*nlh));
+
+	nlmsg_end(skb, nlh);
+
+	return nlmsg_unicast(net->genl_sock, skb, snd_portid);
+}
+
 static const struct genl_ops ncsi_ops[] = {
 	{
 		.cmd = NCSI_CMD_PKG_INFO,
@@ -385,6 +583,12 @@ static const struct genl_ops ncsi_ops[] = {
 		.doit = ncsi_clear_interface_nl,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NCSI_CMD_SEND_CMD,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_send_cmd_nl,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family ncsi_genl_family __ro_after_init = {
diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h
index 91a5c256f8c4..c4a46887a932 100644
--- a/net/ncsi/ncsi-netlink.h
+++ b/net/ncsi/ncsi-netlink.h
@@ -14,6 +14,18 @@
 
 #include "internal.h"
 
+int ncsi_send_netlink_rsp(struct ncsi_request *nr,
+			  struct ncsi_package *np,
+			  struct ncsi_channel *nc);
+int ncsi_send_netlink_timeout(struct ncsi_request *nr,
+			      struct ncsi_package *np,
+			      struct ncsi_channel *nc);
+int ncsi_send_netlink_err(struct net_device *dev,
+			  u32 snd_seq,
+			  u32 snd_portid,
+			  struct nlmsghdr *nlhdr,
+			  int err);
+
 int ncsi_init_netlink(struct net_device *dev);
 int ncsi_unregister_netlink(struct net_device *dev);
 
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index d66b34749027..85fa59afae34 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -16,9 +16,11 @@
 #include <net/ncsi.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/genetlink.h>
 
 #include "internal.h"
 #include "ncsi-pkt.h"
+#include "ncsi-netlink.h"
 
 static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
 				 unsigned short payload)
@@ -32,15 +34,25 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
 	 * before calling this function.
 	 */
 	h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp);
-	if (h->common.revision != NCSI_PKT_REVISION)
+
+	if (h->common.revision != NCSI_PKT_REVISION) {
+		netdev_dbg(nr->ndp->ndev.dev,
+			   "NCSI: unsupported header revision\n");
 		return -EINVAL;
-	if (ntohs(h->common.length) != payload)
+	}
+	if (ntohs(h->common.length) != payload) {
+		netdev_dbg(nr->ndp->ndev.dev,
+			   "NCSI: payload length mismatched\n");
 		return -EINVAL;
+	}
 
 	/* Check on code and reason */
 	if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED ||
-	    ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR)
-		return -EINVAL;
+	    ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) {
+		netdev_dbg(nr->ndp->ndev.dev,
+			   "NCSI: non zero response/reason code\n");
+		return -EPERM;
+	}
 
 	/* Validate checksum, which might be zeroes if the
 	 * sender doesn't support checksum according to NCSI
@@ -52,8 +64,11 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
 
 	checksum = ncsi_calculate_checksum((unsigned char *)h,
 					   sizeof(*h) + payload - 4);
-	if (*pchecksum != htonl(checksum))
+
+	if (*pchecksum != htonl(checksum)) {
+		netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n");
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -941,6 +956,26 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr)
 	return 0;
 }
 
+static int ncsi_rsp_handler_netlink(struct ncsi_request *nr)
+{
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct ncsi_rsp_pkt *rsp;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	int ret;
+
+	/* Find the package */
+	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+	ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+				      &np, &nc);
+	if (!np)
+		return -ENODEV;
+
+	ret = ncsi_send_netlink_rsp(nr, np, nc);
+
+	return ret;
+}
+
 static struct ncsi_rsp_handler {
 	unsigned char	type;
 	int             payload;
@@ -1043,6 +1078,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
 		netdev_warn(ndp->ndev.dev,
 			    "NCSI: 'bad' packet ignored for type 0x%x\n",
 			    hdr->type);
+
+		if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+			if (ret == -EPERM)
+				goto out_netlink;
+			else
+				ncsi_send_netlink_err(ndp->ndev.dev,
+						      nr->snd_seq,
+						      nr->snd_portid,
+						      &nr->nlhdr,
+						      ret);
+		}
 		goto out;
 	}
 
@@ -1052,6 +1098,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
 		netdev_err(ndp->ndev.dev,
 			   "NCSI: Handler for packet type 0x%x returned %d\n",
 			   hdr->type, ret);
+
+out_netlink:
+	if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+		ret = ncsi_rsp_handler_netlink(nr);
+		if (ret) {
+			netdev_err(ndp->ndev.dev,
+				   "NCSI: Netlink handler for packet type 0x%x returned %d\n",
+				   hdr->type, ret);
+		}
+	}
+
 out:
 	ncsi_free_request(nr);
 	return ret;
-- 
cgit v1.2.3


From 5f6188a8003d080e3753b8f14f4a5a2325ae1ff6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Oct 2018 09:37:52 -0700
Subject: tcp: do not change tcp_wstamp_ns in tcp_mstamp_refresh

In EDT design, I made the mistake of using tcp_wstamp_ns
to store the last tcp_clock_ns() sample and to store the
pacing virtual timer.

This causes major regressions at high speed flows.

Introduce tcp_clock_cache to store last tcp_clock_ns().
This is needed because some arches have slow high-resolution
kernel time service.

tcp_wstamp_ns is only updated when a packet is sent.

Note that we can remove tcp_mstamp in the future since
tcp_mstamp is essentially tcp_clock_cache/1000, so the
apparent socket size increase is temporary.

Fixes: 9799ccb0e984 ("tcp: add tcp_wstamp_ns socket field")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   | 1 +
 net/ipv4/tcp_output.c | 9 ++++++---
 net/ipv4/tcp_timer.c  | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 848f5b25e178..8ed77bb4ed86 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -249,6 +249,7 @@ struct tcp_sock {
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
 	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
+	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 
 /* RTT measurement */
 	u64	tcp_mstamp;	/* most recent packet received/sent */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 059b67af28b1..f14df66a0c85 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -52,9 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
 {
 	u64 val = tcp_clock_ns();
 
-	/* departure time for next data packet */
-	if (val > tp->tcp_wstamp_ns)
-		tp->tcp_wstamp_ns = val;
+	if (val > tp->tcp_clock_cache)
+		tp->tcp_clock_cache = val;
 
 	val = div_u64(val, NSEC_PER_USEC);
 	if (val > tp->tcp_mstamp)
@@ -1050,6 +1049,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
+
+	/* TODO: might take care of jitter here */
+	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+
 	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
 	inet = inet_sk(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 61023d50cd60..676020663ce8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
 	 */
 	start_ts = tcp_skb_timestamp(skb);
 	if (!start_ts)
-		skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+		skb->skb_mstamp_ns = tp->tcp_clock_cache;
 	else if (icsk->icsk_user_timeout &&
 		 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
 		goto abort;
-- 
cgit v1.2.3


From 76a9ebe811fb3d0605cb084f1ae6be5610541865 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Oct 2018 09:37:53 -0700
Subject: net: extend sk_pacing_rate to unsigned long

sk_pacing_rate has beed introduced as a u32 field in 2013,
effectively limiting per flow pacing to 34Gbit.

We believe it is time to allow TCP to pace high speed flows
on 64bit hosts, as we now can reach 100Gbit on one TCP flow.

This patch adds no cost for 32bit kernels.

The tcpi_pacing_rate and tcpi_max_pacing_rate were already
exported as 64bit, so iproute2/ss command require no changes.

Unfortunately the SO_MAX_PACING_RATE socket option will stay
32bit and we will need to add a new option to let applications
control high pacing rates.

State      Recv-Q Send-Q Local Address:Port             Peer Address:Port
ESTAB      0      1787144  10.246.9.76:49992             10.246.9.77:36741
                 timer:(on,003ms,0) ino:91863 sk:2 <->
 skmem:(r0,rb540000,t66440,tb2363904,f605944,w1822984,o0,bl0,d0)
 ts sack bbr wscale:8,8 rto:201 rtt:0.057/0.006 mss:1448
 rcvmss:536 advmss:1448
 cwnd:138 ssthresh:178 bytes_acked:256699822585 segs_out:177279177
 segs_in:3916318 data_segs_out:177279175
 bbr:(bw:31276.8Mbps,mrtt:0,pacing_gain:1.25,cwnd_gain:2)
 send 28045.5Mbps lastrcv:73333
 pacing_rate 38705.0Mbps delivery_rate 22997.6Mbps
 busy:73333ms unacked:135 retrans:0/157 rcv_space:14480
 notsent:2085120 minrtt:0.013

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h    |  4 ++--
 net/core/filter.c     |  4 ++--
 net/core/sock.c       |  9 +++++----
 net/ipv4/tcp.c        | 10 +++++-----
 net/ipv4/tcp_bbr.c    |  6 +++---
 net/ipv4/tcp_output.c | 19 +++++++++++--------
 net/sched/sch_fq.c    | 20 ++++++++++++--------
 7 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 751549ac0a84..cfaf261936c8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -422,8 +422,8 @@ struct sock {
 	struct timer_list	sk_timer;
 	__u32			sk_priority;
 	__u32			sk_mark;
-	u32			sk_pacing_rate; /* bytes per second */
-	u32			sk_max_pacing_rate;
+	unsigned long		sk_pacing_rate; /* bytes per second */
+	unsigned long		sk_max_pacing_rate;
 	struct page_frag	sk_frag;
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;
diff --git a/net/core/filter.c b/net/core/filter.c
index 4bbc6567fcb8..80da21b097b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3927,8 +3927,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 			break;
-		case SO_MAX_PACING_RATE:
-			sk->sk_max_pacing_rate = val;
+		case SO_MAX_PACING_RATE: /* 32bit version */
+			sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
 			sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 						 sk->sk_max_pacing_rate);
 			break;
diff --git a/net/core/sock.c b/net/core/sock.c
index 7e8796a6a089..fdf9fc7d3f98 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -998,7 +998,7 @@ set_rcvbuf:
 			cmpxchg(&sk->sk_pacing_status,
 				SK_PACING_NONE,
 				SK_PACING_NEEDED);
-		sk->sk_max_pacing_rate = val;
+		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 					 sk->sk_max_pacing_rate);
 		break;
@@ -1336,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 #endif
 
 	case SO_MAX_PACING_RATE:
-		v.val = sk->sk_max_pacing_rate;
+		/* 32bit version */
+		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
 		break;
 
 	case SO_INCOMING_CPU:
@@ -2810,8 +2811,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_ll_usec		=	sysctl_net_busy_read;
 #endif
 
-	sk->sk_max_pacing_rate = ~0U;
-	sk->sk_pacing_rate = ~0U;
+	sk->sk_max_pacing_rate = ~0UL;
+	sk->sk_pacing_rate = ~0UL;
 	sk->sk_pacing_shift = 10;
 	sk->sk_incoming_cpu = -1;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 43ef83b2330e..b8ba8fa34eff 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3111,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned long rate;
 	u32 now;
 	u64 rate64;
 	bool slow;
-	u32 rate;
 
 	memset(info, 0, sizeof(*info));
 	if (sk->sk_type != SOCK_STREAM)
@@ -3124,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	/* Report meaningful fields for all TCP states, including listeners */
 	rate = READ_ONCE(sk->sk_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
+	rate64 = (rate != ~0UL) ? rate : ~0ULL;
 	info->tcpi_pacing_rate = rate64;
 
 	rate = READ_ONCE(sk->sk_max_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
+	rate64 = (rate != ~0UL) ? rate : ~0ULL;
 	info->tcpi_max_pacing_rate = rate64;
 
 	info->tcpi_reordering = tp->reordering;
@@ -3254,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *stats;
 	struct tcp_info info;
+	unsigned long rate;
 	u64 rate64;
-	u32 rate;
 
 	stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
 	if (!stats)
@@ -3274,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  tp->total_retrans, TCP_NLA_PAD);
 
 	rate = READ_ONCE(sk->sk_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
+	rate64 = (rate != ~0UL) ? rate : ~0ULL;
 	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
 
 	rate64 = tcp_compute_delivery_rate(tp);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index a5786e3e2c16..33f4358615e6 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -219,7 +219,7 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 }
 
 /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	u64 rate = bw;
 
@@ -258,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
 	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
 		bbr_init_pacing_rate_from_rtt(sk);
@@ -280,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
 	/* Sort of tcp_tso_autosize() but ignoring
 	 * driver provided sk_gso_max_size.
 	 */
-	bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+	bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
 		      GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
 	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f14df66a0c85..f4aa4109334a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -991,14 +991,14 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
 
 	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 	if (sk->sk_pacing_status != SK_PACING_NONE) {
-		u32 rate = sk->sk_pacing_rate;
+		unsigned long rate = sk->sk_pacing_rate;
 
 		/* Original sch_fq does not pace first 10 MSS
 		 * Note that tp->data_segs_out overflows after 2^32 packets,
 		 * this is a minor annoyance.
 		 */
-		if (rate != ~0U && rate && tp->data_segs_out >= 10) {
-			tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);
+		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+			tp->tcp_wstamp_ns += div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
 
 			tcp_internal_pacing(sk);
 		}
@@ -1704,8 +1704,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 {
 	u32 bytes, segs;
 
-	bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
-		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+	bytes = min_t(unsigned long,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift,
+		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
 
 	/* Goal is to send at least one packet per ms,
 	 * not one big TSO packet every 100 ms.
@@ -2198,10 +2199,12 @@ static bool tcp_pacing_check(const struct sock *sk)
 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 				  unsigned int factor)
 {
-	unsigned int limit;
+	unsigned long limit;
 
-	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
-	limit = min_t(u32, limit,
+	limit = max_t(unsigned long,
+		      2 * skb->truesize,
+		      sk->sk_pacing_rate >> sk->sk_pacing_shift);
+	limit = min_t(unsigned long, limit,
 		      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 338222a6c664..3923d1409533 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -92,8 +92,8 @@ struct fq_sched_data {
 	u32		quantum;
 	u32		initial_quantum;
 	u32		flow_refill_delay;
-	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
+	unsigned long	flow_max_rate;	/* optional max rate per flow */
 	u32		orphan_mask;	/* mask for orphaned skb */
 	u32		low_rate_threshold;
 	struct rb_root	*fq_root;
@@ -416,7 +416,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
-	u32 rate, plen;
+	unsigned long rate;
+	u32 plen;
 
 	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
@@ -485,11 +486,11 @@ begin:
 		if (f->credit > 0)
 			goto out;
 	}
-	if (rate != ~0U) {
+	if (rate != ~0UL) {
 		u64 len = (u64)plen * NSEC_PER_SEC;
 
 		if (likely(rate))
-			do_div(len, rate);
+			len = div64_ul(len, rate);
 		/* Since socket rate can change later,
 		 * clamp the delay to 1 second.
 		 * Really, providers of too big packets should be fixed !
@@ -701,9 +702,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
 				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
 
-	if (tb[TCA_FQ_FLOW_MAX_RATE])
-		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
+		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
 
+		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
+	}
 	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
 		q->low_rate_threshold =
 			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
@@ -766,7 +769,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
 	q->flow_refill_delay	= msecs_to_jiffies(40);
-	q->flow_max_rate	= ~0U;
+	q->flow_max_rate	= ~0UL;
 	q->time_next_delayed_flow = ~0ULL;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
@@ -802,7 +805,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
-	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
+			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
 			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
-- 
cgit v1.2.3


From 22e6c58b8c2843337ec4e8464b1ce6e869ca5bf4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 15 Oct 2018 18:56:41 -0700
Subject: netlink: Add answer_flags to netlink_callback

With dump filtering we need a way to ensure the NLM_F_DUMP_FILTERED
flag is set on a message back to the user if the data returned is
influenced by some input attributes. Normally this can be done as
messages are added to the skb, but if the filter results in no data
being returned, the user could be confused as to why.

This patch adds answer_flags to the netlink_callback allowing dump
handlers to set the NLM_F_DUMP_FILTERED at a minimum in the
NLMSG_DONE message ensuring the flag gets back to the user.

The netlink_callback space is initialized to 0 via a memset in
__netlink_dump_start, so init of the new answer_flags is covered.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  | 1 +
 net/netlink/af_netlink.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 72580f1a72a2..4da90a6ab536 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -180,6 +180,7 @@ struct netlink_callback {
 	u16			family;
 	u16			min_dump_alloc;
 	bool			strict_check;
+	u16			answer_flags;
 	unsigned int		prev_seq, seq;
 	long			args[6];
 };
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e613a9f89600..6bb9f3cde0b0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2257,7 +2257,8 @@ static int netlink_dump(struct sock *sk)
 	}
 
 	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE,
-			       sizeof(nlk->dump_done_errno), NLM_F_MULTI);
+			       sizeof(nlk->dump_done_errno),
+			       NLM_F_MULTI | cb->answer_flags);
 	if (WARN_ON(!nlh))
 		goto errout_skb;
 
-- 
cgit v1.2.3


From 4724676d551c0961659b1da3fb4b5928169fb184 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 15 Oct 2018 18:56:42 -0700
Subject: net: Add struct for fib dump filter

Add struct fib_dump_filter for options on limiting which routes are
returned in a dump request. The current list is table id, protocol,
route type, rtm_flags and nexthop device index. struct net is needed
to lookup the net_device from the index.

Declare the filter for each route dump handler and plumb the new
arguments from dump handlers to ip_valid_fib_dump_req.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  1 +
 include/net/ip_fib.h    | 13 ++++++++++++-
 net/ipv4/fib_frontend.c |  6 ++++--
 net/ipv4/ipmr.c         |  6 +++++-
 net/ipv6/ip6_fib.c      |  5 +++--
 net/ipv6/ip6mr.c        |  5 ++++-
 net/mpls/af_mpls.c      | 12 ++++++++----
 7 files changed, 37 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7ab119936e69 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,6 +174,7 @@ struct rt6_rtnl_dump_arg {
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
 	struct net *net;
+	struct fib_dump_filter filter;
 };
 
 int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 852e4ebf2209..667013bf4266 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,6 +222,16 @@ struct fib_table {
 	unsigned long		__data[0];
 };
 
+struct fib_dump_filter {
+	u32			table_id;
+	/* filter_set is an optimization that an entry is set */
+	bool			filter_set;
+	unsigned char		protocol;
+	unsigned char		rt_type;
+	unsigned int		flags;
+	struct net_device	*dev;
+};
+
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags);
 int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
@@ -453,6 +463,7 @@ static inline void fib_proc_exit(struct net *net)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 0f1beceb47d5..850850dd80e1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -802,7 +802,8 @@ errout:
 	return err;
 }
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -837,6 +838,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct fib_dump_filter filter = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib_table *tb;
@@ -844,7 +846,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	int dumped = 0, err;
 
 	if (cb->strict_check) {
-		err = ip_valid_fib_dump_req(nlh, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 91b0d5671649..44d777058960 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,9 +2527,13 @@ errout_free:
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib_dump_filter filter = {};
+
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(cb->nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0783af11b0b7..94e61fe47ff8 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -569,17 +569,18 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct rt6_rtnl_dump_arg arg = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
-	struct rt6_rtnl_dump_arg arg;
 	struct fib6_walker *w;
 	struct fib6_table *tb;
 	struct hlist_head *head;
 	int res = 0;
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d7563ef76518..dbd5166c5599 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2458,10 +2458,13 @@ errout:
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
+	struct fib_dump_filter filter = {};
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5fe274c47c41..bfcb4759c9ee 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2032,13 +2032,15 @@ nla_put_failure:
 }
 
 #if IS_ENABLED(CONFIG_INET)
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
-	return ip_valid_fib_dump_req(nlh, extack);
+	return ip_valid_fib_dump_req(net, nlh, filter, extack);
 }
 #else
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -2070,14 +2072,16 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
+	struct fib_dump_filter filter = {};
 	size_t platform_labels;
 	unsigned int index;
 
 	ASSERT_RTNL();
 
 	if (cb->strict_check) {
-		int err = mpls_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
-- 
cgit v1.2.3


From 18a8021a7be3207686851208f91a2f105b2d4703 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 15 Oct 2018 18:56:43 -0700
Subject: net/ipv4: Plumb support for filtering route dumps

Implement kernel side filtering of routes by table id, egress device index,
protocol and route type. If the table id is given in the filter, lookup the
table and call fib_table_dump directly for it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c | 13 ++++++++++++-
 net/ipv4/fib_trie.c     | 37 ++++++++++++++++++++++++++-----------
 3 files changed, 39 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 667013bf4266..1eabc9edd2b9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -239,7 +239,7 @@ int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
 int fib_table_delete(struct net *, struct fib_table *, struct fib_config *,
 		     struct netlink_ext_ack *extack);
 int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
-		   struct netlink_callback *cb);
+		   struct netlink_callback *cb, struct fib_dump_filter *filter);
 int fib_table_flush(struct net *net, struct fib_table *table);
 struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
 void fib_table_flush_external(struct fib_table *table);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 850850dd80e1..37dc8ac366fd 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -855,6 +855,17 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	    ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED)
 		return skb->len;
 
+	if (filter.table_id) {
+		tb = fib_get_table(net, filter.table_id);
+		if (!tb) {
+			NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+			return -ENOENT;
+		}
+
+		err = fib_table_dump(tb, skb, cb, &filter);
+		return skb->len ? : err;
+	}
+
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
@@ -869,7 +880,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 			if (dumped)
 				memset(&cb->args[2], 0, sizeof(cb->args) -
 						 2 * sizeof(cb->args[0]));
-			err = fib_table_dump(tb, skb, cb);
+			err = fib_table_dump(tb, skb, cb, &filter);
 			if (err < 0) {
 				if (likely(skb->len))
 					goto out;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
 }
 
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+			     struct sk_buff *skb, struct netlink_callback *cb,
+			     struct fib_dump_filter *filter)
 {
+	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
 	struct fib_alias *fa;
 	int i, s_i;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	s_i = cb->args[4];
 	i = 0;
 
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
 		int err;
 
-		if (i < s_i) {
-			i++;
-			continue;
-		}
+		if (i < s_i)
+			goto next;
 
-		if (tb->tb_id != fa->tb_id) {
-			i++;
-			continue;
+		if (tb->tb_id != fa->tb_id)
+			goto next;
+
+		if (filter->filter_set) {
+			if (filter->rt_type && fa->fa_type != filter->rt_type)
+				goto next;
+
+			if ((filter->protocol &&
+			     fa->fa_info->fib_protocol != filter->protocol))
+				goto next;
+
+			if (filter->dev &&
+			    !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+				goto next;
 		}
 
 		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 				    tb->tb_id, fa->fa_type,
 				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+				    fa->fa_tos, fa->fa_info, flags);
 		if (err < 0) {
 			cb->args[4] = i;
 			return err;
 		}
+next:
 		i++;
 	}
 
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 /* rcu_read_lock needs to be hold by caller from readside */
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
+		   struct netlink_callback *cb, struct fib_dump_filter *filter)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		int err;
 
-		err = fn_trie_dump_leaf(l, tb, skb, cb);
+		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
 		if (err < 0) {
 			cb->args[3] = key;
 			cb->args[2] = count;
-- 
cgit v1.2.3


From e1cedae1ba6b09ae8376c1486712bf91ea0dfc41 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 15 Oct 2018 18:56:46 -0700
Subject: ipmr: Refactor mr_rtm_dumproute

Move per-table loops from mr_rtm_dumproute to mr_table_dump and export
mr_table_dump for dumps by specific table id.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h |  6 ++++
 net/ipv4/ipmr_base.c        | 88 ++++++++++++++++++++++++++++-----------------
 2 files changed, 61 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 6675b9f81979..db85373c8d15 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -283,6 +283,12 @@ void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
 
 int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 		   struct mr_mfc *c, struct rtmsg *rtm);
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+		  struct netlink_callback *cb,
+		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+			      u32 portid, u32 seq, struct mr_mfc *c,
+			      int cmd, int flags),
+		  spinlock_t *lock);
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..132dd2613ca5 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,55 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+		  struct netlink_callback *cb,
+		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+			      u32 portid, u32 seq, struct mr_mfc *c,
+			      int cmd, int flags),
+		  spinlock_t *lock)
+{
+	unsigned int e = 0, s_e = cb->args[1];
+	unsigned int flags = NLM_F_MULTI;
+	struct mr_mfc *mfc;
+	int err;
+
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+		if (e < s_e)
+			goto next_entry;
+
+		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+		if (err < 0)
+			goto out;
+next_entry:
+		e++;
+	}
+	e = 0;
+	s_e = 0;
+
+	spin_lock_bh(lock);
+	list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+		if (e < s_e)
+			goto next_entry2;
+
+		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+		if (err < 0) {
+			spin_unlock_bh(lock);
+			goto out;
+		}
+next_entry2:
+		e++;
+	}
+	spin_unlock_bh(lock);
+	err = 0;
+	e = 0;
+
+out:
+	cb->args[1] = e;
+	return err;
+}
+
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -277,51 +326,24 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 int cmd, int flags),
 		     spinlock_t *lock)
 {
-	unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
+	unsigned int t = 0, s_t = cb->args[0];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
-	struct mr_mfc *mfc;
+	int err;
 
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
-		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
-			if (e < s_e)
-				goto next_entry;
-			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
-				 cb->nlh->nlmsg_seq, mfc,
-				 RTM_NEWROUTE, NLM_F_MULTI) < 0)
-				goto done;
-next_entry:
-			e++;
-		}
-		e = 0;
-		s_e = 0;
-
-		spin_lock_bh(lock);
-		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
-			if (e < s_e)
-				goto next_entry2;
-			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
-				 cb->nlh->nlmsg_seq, mfc,
-				 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
-				spin_unlock_bh(lock);
-				goto done;
-			}
-next_entry2:
-			e++;
-		}
-		spin_unlock_bh(lock);
-		e = 0;
-		s_e = 0;
+
+		err = mr_table_dump(mrt, skb, cb, fill, lock);
+		if (err < 0)
+			break;
 next_table:
 		t++;
 	}
-done:
 	rcu_read_unlock();
 
-	cb->args[1] = e;
 	cb->args[0] = t;
 
 	return skb->len;
-- 
cgit v1.2.3


From cb167893f41e21e6bd283d78e53489289dc0592d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 15 Oct 2018 18:56:47 -0700
Subject: net: Plumb support for filtering ipv4 and ipv6 multicast route dumps

Implement kernel side filtering of routes by egress device index and
table id. If the table id is given in the filter, lookup table and
call mr_table_dump directly for it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h |  7 ++++---
 net/ipv4/ipmr.c             | 18 +++++++++++++++---
 net/ipv4/ipmr_base.c        | 43 ++++++++++++++++++++++++++++++++++++++++---
 net/ipv6/ip6mr.c            | 18 +++++++++++++++---
 4 files changed, 74 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index db85373c8d15..34de06b426ef 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -7,6 +7,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/fib_notifier.h>
+#include <net/ip_fib.h>
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -288,7 +289,7 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
 		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
 			      u32 portid, u32 seq, struct mr_mfc *c,
 			      int cmd, int flags),
-		  spinlock_t *lock);
+		  spinlock_t *lock, struct fib_dump_filter *filter);
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -296,7 +297,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock);
+		     spinlock_t *lock, struct fib_dump_filter *filter);
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
@@ -346,7 +347,7 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 			     struct sk_buff *skb,
 			     u32 portid, u32 seq, struct mr_mfc *c,
 			     int cmd, int flags),
-		 spinlock_t *lock)
+		 spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	return -EINVAL;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 44d777058960..3fa988e6a3df 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2528,18 +2528,30 @@ errout_free:
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct fib_dump_filter filter = {};
+	int err;
 
 	if (cb->strict_check) {
-		int err;
-
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
 					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
 
+	if (filter.table_id) {
+		struct mr_table *mrt;
+
+		mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
+		if (!mrt) {
+			NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+			return -ENOENT;
+		}
+		err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+				    &mfc_unres_lock, &filter);
+		return skb->len ? : err;
+	}
+
 	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
-				_ipmr_fill_mroute, &mfc_unres_lock);
+				_ipmr_fill_mroute, &mfc_unres_lock, &filter);
 }
 
 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 132dd2613ca5..844806120f44 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,21 +268,45 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+			    const struct mr_mfc *c,
+			    const struct net_device *dev)
+{
+	int ct;
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			const struct vif_device *vif;
+
+			vif = &mrt->vif_table[ct];
+			if (vif->dev == dev)
+				return true;
+		}
+	}
+	return false;
+}
+
 int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
 		  struct netlink_callback *cb,
 		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
 			      u32 portid, u32 seq, struct mr_mfc *c,
 			      int cmd, int flags),
-		  spinlock_t *lock)
+		  spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int e = 0, s_e = cb->args[1];
 	unsigned int flags = NLM_F_MULTI;
 	struct mr_mfc *mfc;
 	int err;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 		if (e < s_e)
 			goto next_entry;
+		if (filter->dev &&
+		    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+			goto next_entry;
 
 		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
@@ -298,6 +322,9 @@ next_entry:
 	list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 		if (e < s_e)
 			goto next_entry2;
+		if (filter->dev &&
+		    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+			goto next_entry2;
 
 		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
@@ -316,6 +343,7 @@ out:
 	cb->args[1] = e;
 	return err;
 }
+EXPORT_SYMBOL(mr_table_dump);
 
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
@@ -324,19 +352,28 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock)
+		     spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int t = 0, s_t = cb->args[0];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
 	int err;
 
+	/* multicast does not track protocol or have route type other
+	 * than RTN_MULTICAST
+	 */
+	if (filter->filter_set) {
+		if (filter->protocol || filter->flags ||
+		    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+			return skb->len;
+	}
+
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
 
-		err = mr_table_dump(mrt, skb, cb, fill, lock);
+		err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
 		if (err < 0)
 			break;
 next_table:
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dbd5166c5599..9759b0aecdd6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2459,16 +2459,28 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct fib_dump_filter filter = {};
+	int err;
 
 	if (cb->strict_check) {
-		int err;
-
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
 					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
 
+	if (filter.table_id) {
+		struct mr_table *mrt;
+
+		mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
+		if (!mrt) {
+			NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
+			return -ENOENT;
+		}
+		err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
+				    &mfc_unres_lock, &filter);
+		return skb->len ? : err;
+	}
+
 	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
-				_ip6mr_fill_mroute, &mfc_unres_lock);
+				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
 }
-- 
cgit v1.2.3


From effe6792662495ad9c175bf0d9c53459a51fdbbd Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 15 Oct 2018 18:56:48 -0700
Subject: net: Enable kernel side filtering of route dumps

Update parsing of route dump request to enable kernel side filtering.
Allow filtering results by protocol (e.g., which routing daemon installed
the route), route type (e.g., unicast), table id and nexthop device. These
amount to the low hanging fruit, yet a huge improvement, for dumping
routes.

ip_valid_fib_dump_req is called with RTNL held, so __dev_get_by_index can
be used to look up the device index without taking a reference. From
there filter->dev is only used during dump loops with the lock still held.

Set NLM_F_DUMP_FILTERED in the answer_flags so the user knows the results
have been filtered should no entries be returned.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c | 51 ++++++++++++++++++++++++++++++++++++++++++-------
 net/ipv4/ipmr.c         |  2 +-
 net/ipv6/ip6_fib.c      |  2 +-
 net/ipv6/ip6mr.c        |  2 +-
 net/mpls/af_mpls.c      |  9 +++++----
 6 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1eabc9edd2b9..e8d9456bf36e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -465,5 +465,5 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
 int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
-			  struct netlink_ext_ack *extack);
+			  struct netlink_callback *cb);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 37dc8ac366fd..e86ca2255181 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -804,9 +804,14 @@ errout:
 
 int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
-			  struct netlink_ext_ack *extack)
+			  struct netlink_callback *cb)
 {
+	struct netlink_ext_ack *extack = cb->extack;
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
+
+	ASSERT_RTNL();
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
@@ -815,8 +820,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type) {
+	    rtm->rtm_scope) {
 		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
@@ -825,9 +829,42 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	filter->flags    = rtm->rtm_flags;
+	filter->protocol = rtm->rtm_protocol;
+	filter->rt_type  = rtm->rtm_type;
+	filter->table_id = rtm->rtm_table;
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_ipv4_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_TABLE:
+			filter->table_id = nla_get_u32(tb[i]);
+			break;
+		case RTA_OIF:
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	if (filter->flags || filter->protocol || filter->rt_type ||
+	    filter->table_id || filter->dev) {
+		filter->filter_set = 1;
+		cb->answer_flags = NLM_F_DUMP_FILTERED;
 	}
 
 	return 0;
@@ -846,7 +883,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	int dumped = 0, err;
 
 	if (cb->strict_check) {
-		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3fa988e6a3df..7a3e2acda94c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2532,7 +2532,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (cb->strict_check) {
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
-					    &filter, cb->extack);
+					    &filter, cb);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index a51fc357a05c..5562c77022c6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -580,7 +580,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	if (cb->strict_check) {
 		int err;
 
-		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
 		if (err < 0)
 			return err;
 	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9759b0aecdd6..c3317ffb09eb 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2463,7 +2463,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (cb->strict_check) {
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
-					    &filter, cb->extack);
+					    &filter, cb);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 48f4cbd9fb38..24381696932a 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2034,15 +2034,16 @@ nla_put_failure:
 #if IS_ENABLED(CONFIG_INET)
 static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 				   struct fib_dump_filter *filter,
-				   struct netlink_ext_ack *extack)
+				   struct netlink_callback *cb)
 {
-	return ip_valid_fib_dump_req(net, nlh, filter, extack);
+	return ip_valid_fib_dump_req(net, nlh, filter, cb);
 }
 #else
 static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 				   struct fib_dump_filter *filter,
-				   struct netlink_ext_ack *extack)
+				   struct netlink_callback *cb)
 {
+	struct netlink_ext_ack *extack = cb->extack;
 	struct rtmsg *rtm;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
@@ -2104,7 +2105,7 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	if (cb->strict_check) {
 		int err;
 
-		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
+		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb);
 		if (err < 0)
 			return err;
 
-- 
cgit v1.2.3


From a218dc82f0b5c6c8ad3d58c9870ed69e26c08b3e Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Wed, 10 Oct 2018 09:57:13 +0200
Subject: netfilter: nft_osf: Add ttl option support

Add ttl option support to the nftables "osf" expression.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_osf.h  |  3 ++-
 include/uapi/linux/netfilter/nf_tables.h |  7 +++++
 net/netfilter/nfnetlink_osf.c            | 46 +++++++++++++++-----------------
 net/netfilter/nft_osf.c                  | 15 ++++++++++-
 4 files changed, 44 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h
index ecf7dab81e9e..c6000046c966 100644
--- a/include/linux/netfilter/nfnetlink_osf.h
+++ b/include/linux/netfilter/nfnetlink_osf.h
@@ -27,6 +27,7 @@ bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 		  const struct list_head *nf_osf_fingers);
 
 const char *nf_osf_find(const struct sk_buff *skb,
-                        const struct list_head *nf_osf_fingers);
+			const struct list_head *nf_osf_fingers,
+			const int ttl_check);
 
 #endif /* _NFOSF_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5444e76870bb..579974b0bf0d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1511,9 +1511,16 @@ enum nft_flowtable_hook_attributes {
 };
 #define NFTA_FLOWTABLE_HOOK_MAX	(__NFTA_FLOWTABLE_HOOK_MAX - 1)
 
+/**
+ * enum nft_osf_attributes - nftables osf expression netlink attributes
+ *
+ * @NFTA_OSF_DREG: destination register (NLA_U32: nft_registers)
+ * @NFTA_OSF_TTL: Value of the TTL osf option (NLA_U8)
+ */
 enum nft_osf_attributes {
 	NFTA_OSF_UNSPEC,
 	NFTA_OSF_DREG,
+	NFTA_OSF_TTL,
 	__NFTA_OSF_MAX,
 };
 #define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 00db27dfd2ff..6f41dd74729d 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -30,32 +30,27 @@ EXPORT_SYMBOL_GPL(nf_osf_fingers);
 static inline int nf_osf_ttl(const struct sk_buff *skb,
 			     int ttl_check, unsigned char f_ttl)
 {
+	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	const struct iphdr *ip = ip_hdr(skb);
-
-	if (ttl_check != -1) {
-		if (ttl_check == NF_OSF_TTL_TRUE)
-			return ip->ttl == f_ttl;
-		if (ttl_check == NF_OSF_TTL_NOCHECK)
-			return 1;
-		else if (ip->ttl <= f_ttl)
-			return 1;
-		else {
-			struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
-			int ret = 0;
-
-			for_ifa(in_dev) {
-				if (inet_ifa_match(ip->saddr, ifa)) {
-					ret = (ip->ttl == f_ttl);
-					break;
-				}
-			}
-			endfor_ifa(in_dev);
-
-			return ret;
+	int ret = 0;
+
+	if (ttl_check == NF_OSF_TTL_TRUE)
+		return ip->ttl == f_ttl;
+	if (ttl_check == NF_OSF_TTL_NOCHECK)
+		return 1;
+	else if (ip->ttl <= f_ttl)
+		return 1;
+
+	for_ifa(in_dev) {
+		if (inet_ifa_match(ip->saddr, ifa)) {
+			ret = (ip->ttl == f_ttl);
+			break;
 		}
 	}
 
-	return ip->ttl == f_ttl;
+	endfor_ifa(in_dev);
+
+	return ret;
 }
 
 struct nf_osf_hdr_ctx {
@@ -213,7 +208,7 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 	if (!tcp)
 		return false;
 
-	ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1;
+	ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0;
 
 	list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
 
@@ -257,7 +252,8 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 EXPORT_SYMBOL_GPL(nf_osf_match);
 
 const char *nf_osf_find(const struct sk_buff *skb,
-			const struct list_head *nf_osf_fingers)
+			const struct list_head *nf_osf_fingers,
+			const int ttl_check)
 {
 	const struct iphdr *ip = ip_hdr(skb);
 	const struct nf_osf_user_finger *f;
@@ -275,7 +271,7 @@ const char *nf_osf_find(const struct sk_buff *skb,
 
 	list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
 		f = &kf->finger;
-		if (!nf_osf_match_one(skb, f, -1, &ctx))
+		if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
 			continue;
 
 		genre = f->genre;
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index a35fb59ace73..0b452fd470c4 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -6,10 +6,12 @@
 
 struct nft_osf {
 	enum nft_registers	dreg:8;
+	u8			ttl;
 };
 
 static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
 	[NFTA_OSF_DREG]		= { .type = NLA_U32 },
+	[NFTA_OSF_TTL]		= { .type = NLA_U8 },
 };
 
 static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
@@ -33,7 +35,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
 		return;
 	}
 
-	os_name = nf_osf_find(skb, nf_osf_fingers);
+	os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl);
 	if (!os_name)
 		strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
 	else
@@ -46,6 +48,14 @@ static int nft_osf_init(const struct nft_ctx *ctx,
 {
 	struct nft_osf *priv = nft_expr_priv(expr);
 	int err;
+	u8 ttl;
+
+	if (nla_get_u8(tb[NFTA_OSF_TTL])) {
+		ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
+		if (ttl > 2)
+			return -EINVAL;
+		priv->ttl = ttl;
+	}
 
 	priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
 	err = nft_validate_register_store(ctx, priv->dreg, NULL,
@@ -60,6 +70,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_osf *priv = nft_expr_priv(expr);
 
+	if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
+		goto nla_put_failure;
+
 	if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From c56a8be7e7aa855ebcccf0e9d9eba2216514d399 Mon Sep 17 00:00:00 2001
From: Rahul Verma <Rahul.Verma@cavium.com>
Date: Tue, 16 Oct 2018 03:59:20 -0700
Subject: qed: Add supported link and advertise link to display in ethtool.

	Added transceiver type, speed capability and board types
	in HSI, are utilizing to display the accurate link
	information in ethtool.

Signed-off-by: Rahul Verma <Rahul.Verma@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c      | 199 ++++++++++++++++++------
 drivers/net/ethernet/qlogic/qed/qed_mcp.c       | 182 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h       |  46 ++++++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  31 +++-
 include/linux/qed/qed_if.h                      |  26 +++-
 5 files changed, 426 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 8c7cbbde65a6..e762881fdb38 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -58,6 +58,7 @@
 #include "qed_iscsi.h"
 
 #include "qed_mcp.h"
+#include "qed_reg_addr.h"
 #include "qed_hw.h"
 #include "qed_selftest.h"
 #include "qed_debug.h"
@@ -1330,8 +1331,7 @@ static int qed_set_link(struct qed_dev *cdev, struct qed_link_params *params)
 		link_params->speed.autoneg = params->autoneg;
 	if (params->override_flags & QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS) {
 		link_params->speed.advertised_speeds = 0;
-		if ((params->adv_speeds & QED_LM_1000baseT_Half_BIT) ||
-		    (params->adv_speeds & QED_LM_1000baseT_Full_BIT))
+		if (params->adv_speeds & QED_LM_1000baseT_Full_BIT)
 			link_params->speed.advertised_speeds |=
 			    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G;
 		if (params->adv_speeds & QED_LM_10000baseKR_Full_BIT)
@@ -1462,13 +1462,149 @@ static int qed_get_link_data(struct qed_hwfn *hwfn,
 	return 0;
 }
 
+static void qed_fill_link_capability(struct qed_hwfn *hwfn,
+				     struct qed_ptt *ptt, u32 capability,
+				     u32 *if_capability)
+{
+	u32 media_type, tcvr_state, tcvr_type;
+	u32 speed_mask, board_cfg;
+
+	if (qed_mcp_get_media_type(hwfn, ptt, &media_type))
+		media_type = MEDIA_UNSPECIFIED;
+
+	if (qed_mcp_get_transceiver_data(hwfn, ptt, &tcvr_state, &tcvr_type))
+		tcvr_type = ETH_TRANSCEIVER_STATE_UNPLUGGED;
+
+	if (qed_mcp_trans_speed_mask(hwfn, ptt, &speed_mask))
+		speed_mask = 0xFFFFFFFF;
+
+	if (qed_mcp_get_board_config(hwfn, ptt, &board_cfg))
+		board_cfg = NVM_CFG1_PORT_PORT_TYPE_UNDEFINED;
+
+	DP_VERBOSE(hwfn->cdev, NETIF_MSG_DRV,
+		   "Media_type = 0x%x tcvr_state = 0x%x tcvr_type = 0x%x speed_mask = 0x%x board_cfg = 0x%x\n",
+		   media_type, tcvr_state, tcvr_type, speed_mask, board_cfg);
+
+	switch (media_type) {
+	case MEDIA_DA_TWINAX:
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
+			*if_capability |= QED_LM_20000baseKR2_Full_BIT;
+		/* For DAC media multiple speed capabilities are supported*/
+		capability = capability & speed_mask;
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G)
+			*if_capability |= QED_LM_1000baseKX_Full_BIT;
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G)
+			*if_capability |= QED_LM_10000baseCR_Full_BIT;
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G)
+			*if_capability |= QED_LM_40000baseCR4_Full_BIT;
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G)
+			*if_capability |= QED_LM_25000baseCR_Full_BIT;
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G)
+			*if_capability |= QED_LM_50000baseCR2_Full_BIT;
+		if (capability &
+			NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G)
+			*if_capability |= QED_LM_100000baseCR4_Full_BIT;
+		break;
+	case MEDIA_BASE_T:
+		if (board_cfg & NVM_CFG1_PORT_PORT_TYPE_EXT_PHY) {
+			if (capability &
+			    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) {
+				*if_capability |= QED_LM_1000baseT_Full_BIT;
+			}
+			if (capability &
+			    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) {
+				*if_capability |= QED_LM_10000baseT_Full_BIT;
+			}
+		}
+		if (board_cfg & NVM_CFG1_PORT_PORT_TYPE_MODULE) {
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_1000BASET)
+				*if_capability |= QED_LM_1000baseT_Full_BIT;
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_10G_BASET)
+				*if_capability |= QED_LM_10000baseT_Full_BIT;
+		}
+		break;
+	case MEDIA_SFP_1G_FIBER:
+	case MEDIA_SFPP_10G_FIBER:
+	case MEDIA_XFP_FIBER:
+	case MEDIA_MODULE_FIBER:
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) {
+			if ((tcvr_type == ETH_TRANSCEIVER_TYPE_1G_LX) ||
+			    (tcvr_type == ETH_TRANSCEIVER_TYPE_1G_SX))
+				*if_capability |= QED_LM_1000baseKX_Full_BIT;
+		}
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) {
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_10G_SR)
+				*if_capability |= QED_LM_10000baseSR_Full_BIT;
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_10G_LR)
+				*if_capability |= QED_LM_10000baseLR_Full_BIT;
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_10G_LRM)
+				*if_capability |= QED_LM_10000baseLRM_Full_BIT;
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_10G_ER)
+				*if_capability |= QED_LM_10000baseR_FEC_BIT;
+		}
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
+			*if_capability |= QED_LM_20000baseKR2_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G) {
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_25G_SR)
+				*if_capability |= QED_LM_25000baseSR_Full_BIT;
+		}
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) {
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_40G_LR4)
+				*if_capability |= QED_LM_40000baseLR4_Full_BIT;
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_40G_SR4)
+				*if_capability |= QED_LM_40000baseSR4_Full_BIT;
+		}
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G)
+			*if_capability |= QED_LM_50000baseKR2_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G) {
+			if (tcvr_type == ETH_TRANSCEIVER_TYPE_100G_SR4)
+				*if_capability |= QED_LM_100000baseSR4_Full_BIT;
+		}
+
+		break;
+	case MEDIA_KR:
+		if (capability & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
+			*if_capability |= QED_LM_20000baseKR2_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G)
+			*if_capability |= QED_LM_1000baseKX_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G)
+			*if_capability |= QED_LM_10000baseKR_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G)
+			*if_capability |= QED_LM_25000baseKR_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G)
+			*if_capability |= QED_LM_40000baseKR4_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G)
+			*if_capability |= QED_LM_50000baseKR2_Full_BIT;
+		if (capability &
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G)
+			*if_capability |= QED_LM_100000baseKR4_Full_BIT;
+		break;
+	case MEDIA_UNSPECIFIED:
+	case MEDIA_NOT_PRESENT:
+		DP_VERBOSE(hwfn->cdev, QED_MSG_DEBUG,
+			   "Unknown media and transceiver type;\n");
+		break;
+	}
+}
+
 static void qed_fill_link(struct qed_hwfn *hwfn,
 			  struct qed_ptt *ptt,
 			  struct qed_link_output *if_link)
 {
+	struct qed_mcp_link_capabilities link_caps;
 	struct qed_mcp_link_params params;
 	struct qed_mcp_link_state link;
-	struct qed_mcp_link_capabilities link_caps;
 	u32 media_type;
 
 	memset(if_link, 0, sizeof(*if_link));
@@ -1499,51 +1635,13 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 		if_link->advertised_caps |= QED_LM_Autoneg_BIT;
 	else
 		if_link->advertised_caps &= ~QED_LM_Autoneg_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G)
-		if_link->advertised_caps |= QED_LM_1000baseT_Half_BIT |
-		    QED_LM_1000baseT_Full_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G)
-		if_link->advertised_caps |= QED_LM_10000baseKR_Full_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
-		if_link->advertised_caps |= QED_LM_20000baseKR2_Full_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G)
-		if_link->advertised_caps |= QED_LM_25000baseKR_Full_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G)
-		if_link->advertised_caps |= QED_LM_40000baseLR4_Full_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G)
-		if_link->advertised_caps |= QED_LM_50000baseKR2_Full_BIT;
-	if (params.speed.advertised_speeds &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G)
-		if_link->advertised_caps |= QED_LM_100000baseKR4_Full_BIT;
-
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G)
-		if_link->supported_caps |= QED_LM_1000baseT_Half_BIT |
-		    QED_LM_1000baseT_Full_BIT;
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G)
-		if_link->supported_caps |= QED_LM_10000baseKR_Full_BIT;
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G)
-		if_link->supported_caps |= QED_LM_20000baseKR2_Full_BIT;
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G)
-		if_link->supported_caps |= QED_LM_25000baseKR_Full_BIT;
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G)
-		if_link->supported_caps |= QED_LM_40000baseLR4_Full_BIT;
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G)
-		if_link->supported_caps |= QED_LM_50000baseKR2_Full_BIT;
-	if (link_caps.speed_capabilities &
-	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G)
-		if_link->supported_caps |= QED_LM_100000baseKR4_Full_BIT;
+
+	/* Fill link advertised capability*/
+	qed_fill_link_capability(hwfn, ptt, params.speed.advertised_speeds,
+				 &if_link->advertised_caps);
+	/* Fill link supported capability*/
+	qed_fill_link_capability(hwfn, ptt, link_caps.speed_capabilities,
+				 &if_link->supported_caps);
 
 	if (link.link_up)
 		if_link->speed = link.speed;
@@ -1563,9 +1661,8 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 		if_link->pause_config |= QED_LINK_PAUSE_TX_ENABLE;
 
 	/* Link partner capabilities */
-	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_1G_HD)
-		if_link->lp_caps |= QED_LM_1000baseT_Half_BIT;
-	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_1G_FD)
+	if (link.partner_adv_speed &
+	    QED_LINK_PARTNER_SPEED_1G_FD)
 		if_link->lp_caps |= QED_LM_1000baseT_Full_BIT;
 	if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_10G)
 		if_link->lp_caps |= QED_LM_10000baseKR_Full_BIT;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 92c5950ad156..554d57ac1629 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1870,6 +1870,8 @@ int qed_mcp_get_mbi_ver(struct qed_hwfn *p_hwfn,
 int qed_mcp_get_media_type(struct qed_hwfn *p_hwfn,
 			   struct qed_ptt *p_ptt, u32 *p_media_type)
 {
+	*p_media_type = MEDIA_UNSPECIFIED;
+
 	if (IS_VF(p_hwfn->cdev))
 		return -EINVAL;
 
@@ -1891,6 +1893,186 @@ int qed_mcp_get_media_type(struct qed_hwfn *p_hwfn,
 	return 0;
 }
 
+int qed_mcp_get_transceiver_data(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 u32 *p_transceiver_state,
+				 u32 *p_transceiver_type)
+{
+	u32 transceiver_info;
+
+	if (IS_VF(p_hwfn->cdev))
+		return -EINVAL;
+
+	if (!qed_mcp_is_init(p_hwfn)) {
+		DP_NOTICE(p_hwfn, "MFW is not initialized!\n");
+		return -EBUSY;
+	}
+
+	*p_transceiver_type = ETH_TRANSCEIVER_TYPE_NONE;
+	*p_transceiver_state = ETH_TRANSCEIVER_STATE_UPDATING;
+
+	transceiver_info = qed_rd(p_hwfn, p_ptt,
+				  p_hwfn->mcp_info->port_addr +
+				  offsetof(struct public_port,
+					   transceiver_data));
+
+	*p_transceiver_state = (transceiver_info &
+				ETH_TRANSCEIVER_STATE_MASK) >>
+				ETH_TRANSCEIVER_STATE_OFFSET;
+
+	if (*p_transceiver_state == ETH_TRANSCEIVER_STATE_PRESENT)
+		*p_transceiver_type = (transceiver_info &
+				       ETH_TRANSCEIVER_TYPE_MASK) >>
+				       ETH_TRANSCEIVER_TYPE_OFFSET;
+	else
+		*p_transceiver_type = ETH_TRANSCEIVER_TYPE_UNKNOWN;
+
+	return 0;
+}
+static bool qed_is_transceiver_ready(u32 transceiver_state,
+				     u32 transceiver_type)
+{
+	if ((transceiver_state & ETH_TRANSCEIVER_STATE_PRESENT) &&
+	    ((transceiver_state & ETH_TRANSCEIVER_STATE_UPDATING) == 0x0) &&
+	    (transceiver_type != ETH_TRANSCEIVER_TYPE_NONE))
+		return true;
+
+	return false;
+}
+
+int qed_mcp_trans_speed_mask(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt, u32 *p_speed_mask)
+{
+	u32 transceiver_type, transceiver_state;
+
+	qed_mcp_get_transceiver_data(p_hwfn, p_ptt, &transceiver_state,
+				     &transceiver_type);
+
+	if (qed_is_transceiver_ready(transceiver_state, transceiver_type) ==
+				     false)
+		return -EINVAL;
+
+	switch (transceiver_type) {
+	case ETH_TRANSCEIVER_TYPE_1G_LX:
+	case ETH_TRANSCEIVER_TYPE_1G_SX:
+	case ETH_TRANSCEIVER_TYPE_1G_PCC:
+	case ETH_TRANSCEIVER_TYPE_1G_ACC:
+	case ETH_TRANSCEIVER_TYPE_1000BASET:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_10G_SR:
+	case ETH_TRANSCEIVER_TYPE_10G_LR:
+	case ETH_TRANSCEIVER_TYPE_10G_LRM:
+	case ETH_TRANSCEIVER_TYPE_10G_ER:
+	case ETH_TRANSCEIVER_TYPE_10G_PCC:
+	case ETH_TRANSCEIVER_TYPE_10G_ACC:
+	case ETH_TRANSCEIVER_TYPE_4x10G:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_40G_LR4:
+	case ETH_TRANSCEIVER_TYPE_40G_SR4:
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_10G_40G_SR:
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_10G_40G_LR:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_100G_AOC:
+	case ETH_TRANSCEIVER_TYPE_100G_SR4:
+	case ETH_TRANSCEIVER_TYPE_100G_LR4:
+	case ETH_TRANSCEIVER_TYPE_100G_ER4:
+	case ETH_TRANSCEIVER_TYPE_100G_ACC:
+		*p_speed_mask =
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_25G_SR:
+	case ETH_TRANSCEIVER_TYPE_25G_LR:
+	case ETH_TRANSCEIVER_TYPE_25G_AOC:
+	case ETH_TRANSCEIVER_TYPE_25G_ACC_S:
+	case ETH_TRANSCEIVER_TYPE_25G_ACC_M:
+	case ETH_TRANSCEIVER_TYPE_25G_ACC_L:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_25G_CA_N:
+	case ETH_TRANSCEIVER_TYPE_25G_CA_S:
+	case ETH_TRANSCEIVER_TYPE_25G_CA_L:
+	case ETH_TRANSCEIVER_TYPE_4x25G_CR:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_40G_CR4:
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_10G_40G_CR:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_100G_CR4:
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_40G_100G_CR:
+		*p_speed_mask =
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_20G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_40G_100G_SR:
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_40G_100G_LR:
+	case ETH_TRANSCEIVER_TYPE_MULTI_RATE_40G_100G_AOC:
+		*p_speed_mask =
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_XLPPI:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G;
+		break;
+	case ETH_TRANSCEIVER_TYPE_10G_BASET:
+		*p_speed_mask = NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G |
+		    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G;
+		break;
+	default:
+		DP_INFO(p_hwfn, "Unknown transcevier type 0x%x\n",
+			transceiver_type);
+		*p_speed_mask = 0xff;
+		break;
+	}
+
+	return 0;
+}
+
+int qed_mcp_get_board_config(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt, u32 *p_board_config)
+{
+	u32 nvm_cfg_addr, nvm_cfg1_offset, port_cfg_addr;
+
+	if (IS_VF(p_hwfn->cdev))
+		return -EINVAL;
+
+	if (!qed_mcp_is_init(p_hwfn)) {
+		DP_NOTICE(p_hwfn, "MFW is not initialized!\n");
+		return -EBUSY;
+	}
+	if (!p_ptt) {
+		*p_board_config = NVM_CFG1_PORT_PORT_TYPE_UNDEFINED;
+		return -EINVAL;
+	}
+
+	nvm_cfg_addr = qed_rd(p_hwfn, p_ptt, MISC_REG_GEN_PURP_CR0);
+	nvm_cfg1_offset = qed_rd(p_hwfn, p_ptt, nvm_cfg_addr + 4);
+	port_cfg_addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
+			offsetof(struct nvm_cfg1, port[MFW_PORT(p_hwfn)]);
+	*p_board_config = qed_rd(p_hwfn, p_ptt,
+				 port_cfg_addr +
+				 offsetof(struct nvm_cfg1_port,
+					  board_cfg));
+
+	return 0;
+}
+
 /* Old MFW has a global configuration for all PFs regarding RDMA support */
 static void
 qed_mcp_get_shmem_proto_legacy(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 80a6b5d1ff33..1adfe52b3905 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -332,6 +332,52 @@ int qed_mcp_get_mbi_ver(struct qed_hwfn *p_hwfn,
 int qed_mcp_get_media_type(struct qed_hwfn *p_hwfn,
 			   struct qed_ptt *p_ptt, u32 *media_type);
 
+/**
+ * @brief Get transceiver data of the port.
+ *
+ * @param cdev      - qed dev pointer
+ * @param p_ptt
+ * @param p_transceiver_state - transceiver state.
+ * @param p_transceiver_type - media type value
+ *
+ * @return int -
+ *      0 - Operation was successful.
+ *      -EBUSY - Operation failed
+ */
+int qed_mcp_get_transceiver_data(struct qed_hwfn *p_hwfn,
+				 struct qed_ptt *p_ptt,
+				 u32 *p_transceiver_state,
+				 u32 *p_tranceiver_type);
+
+/**
+ * @brief Get transceiver supported speed mask.
+ *
+ * @param cdev      - qed dev pointer
+ * @param p_ptt
+ * @param p_speed_mask - Bit mask of all supported speeds.
+ *
+ * @return int -
+ *      0 - Operation was successful.
+ *      -EBUSY - Operation failed
+ */
+
+int qed_mcp_trans_speed_mask(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt, u32 *p_speed_mask);
+
+/**
+ * @brief Get board configuration.
+ *
+ * @param cdev      - qed dev pointer
+ * @param p_ptt
+ * @param p_board_config - Board config.
+ *
+ * @return int -
+ *      0 - Operation was successful.
+ *      -EBUSY - Operation failed
+ */
+int qed_mcp_get_board_config(struct qed_hwfn *p_hwfn,
+			     struct qed_ptt *p_ptt, u32 *p_board_config);
+
 /**
  * @brief General function for sending commands to the MCP
  *        mailbox. It acquire mutex lock for the entire
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 7ff50b4488f6..df3ad591140d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -413,19 +413,42 @@ struct qede_link_mode_mapping {
 };
 
 static const struct qede_link_mode_mapping qed_lm_map[] = {
-	{QED_LM_FIBRE_BIT, ETHTOOL_LINK_MODE_FIBRE_BIT},
 	{QED_LM_Autoneg_BIT, ETHTOOL_LINK_MODE_Autoneg_BIT},
 	{QED_LM_Asym_Pause_BIT, ETHTOOL_LINK_MODE_Asym_Pause_BIT},
 	{QED_LM_Pause_BIT, ETHTOOL_LINK_MODE_Pause_BIT},
-	{QED_LM_1000baseT_Half_BIT, ETHTOOL_LINK_MODE_1000baseT_Half_BIT},
 	{QED_LM_1000baseT_Full_BIT, ETHTOOL_LINK_MODE_1000baseT_Full_BIT},
+	{QED_LM_10000baseT_Full_BIT, ETHTOOL_LINK_MODE_10000baseT_Full_BIT},
+	{QED_LM_2500baseX_Full_BIT, ETHTOOL_LINK_MODE_2500baseX_Full_BIT},
+	{QED_LM_Backplane_BIT, ETHTOOL_LINK_MODE_Backplane_BIT},
+	{QED_LM_1000baseKX_Full_BIT, ETHTOOL_LINK_MODE_1000baseKX_Full_BIT},
+	{QED_LM_10000baseKX4_Full_BIT, ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT},
 	{QED_LM_10000baseKR_Full_BIT, ETHTOOL_LINK_MODE_10000baseKR_Full_BIT},
+	{QED_LM_10000baseKR_Full_BIT, ETHTOOL_LINK_MODE_10000baseKR_Full_BIT},
+	{QED_LM_10000baseR_FEC_BIT, ETHTOOL_LINK_MODE_10000baseR_FEC_BIT},
 	{QED_LM_20000baseKR2_Full_BIT, ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT},
-	{QED_LM_25000baseKR_Full_BIT, ETHTOOL_LINK_MODE_25000baseKR_Full_BIT},
+	{QED_LM_40000baseKR4_Full_BIT, ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT},
+	{QED_LM_40000baseCR4_Full_BIT, ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT},
+	{QED_LM_40000baseSR4_Full_BIT, ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT},
 	{QED_LM_40000baseLR4_Full_BIT, ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT},
+	{QED_LM_25000baseCR_Full_BIT, ETHTOOL_LINK_MODE_25000baseCR_Full_BIT},
+	{QED_LM_25000baseKR_Full_BIT, ETHTOOL_LINK_MODE_25000baseKR_Full_BIT},
+	{QED_LM_25000baseSR_Full_BIT, ETHTOOL_LINK_MODE_25000baseSR_Full_BIT},
+	{QED_LM_50000baseCR2_Full_BIT, ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT},
 	{QED_LM_50000baseKR2_Full_BIT, ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT},
 	{QED_LM_100000baseKR4_Full_BIT,
-	 ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT},
+		ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT},
+	{QED_LM_100000baseSR4_Full_BIT,
+		ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT},
+	{QED_LM_100000baseCR4_Full_BIT,
+		ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT},
+	{QED_LM_100000baseLR4_ER4_Full_BIT,
+		ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT},
+	{QED_LM_50000baseSR2_Full_BIT, ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT},
+	{QED_LM_1000baseX_Full_BIT, ETHTOOL_LINK_MODE_1000baseX_Full_BIT},
+	{QED_LM_10000baseCR_Full_BIT, ETHTOOL_LINK_MODE_10000baseCR_Full_BIT},
+	{QED_LM_10000baseSR_Full_BIT, ETHTOOL_LINK_MODE_10000baseSR_Full_BIT},
+	{QED_LM_10000baseLR_Full_BIT, ETHTOOL_LINK_MODE_10000baseLR_Full_BIT},
+	{QED_LM_10000baseLRM_Full_BIT, ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT},
 };
 
 #define QEDE_DRV_TO_ETHTOOL_CAPS(caps, lk_ksettings, name)	\
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index dee3c9c744f7..a47321a0d572 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -667,15 +667,35 @@ enum qed_link_mode_bits {
 	QED_LM_Autoneg_BIT = BIT(1),
 	QED_LM_Asym_Pause_BIT = BIT(2),
 	QED_LM_Pause_BIT = BIT(3),
-	QED_LM_1000baseT_Half_BIT = BIT(4),
-	QED_LM_1000baseT_Full_BIT = BIT(5),
+	QED_LM_1000baseT_Full_BIT = BIT(4),
+	QED_LM_10000baseT_Full_BIT = BIT(5),
 	QED_LM_10000baseKR_Full_BIT = BIT(6),
 	QED_LM_20000baseKR2_Full_BIT = BIT(7),
 	QED_LM_25000baseKR_Full_BIT = BIT(8),
 	QED_LM_40000baseLR4_Full_BIT = BIT(9),
 	QED_LM_50000baseKR2_Full_BIT = BIT(10),
 	QED_LM_100000baseKR4_Full_BIT = BIT(11),
-	QED_LM_COUNT = 11
+	QED_LM_2500baseX_Full_BIT = BIT(12),
+	QED_LM_Backplane_BIT = BIT(13),
+	QED_LM_1000baseKX_Full_BIT = BIT(14),
+	QED_LM_10000baseKX4_Full_BIT = BIT(15),
+	QED_LM_10000baseR_FEC_BIT = BIT(16),
+	QED_LM_40000baseKR4_Full_BIT = BIT(17),
+	QED_LM_40000baseCR4_Full_BIT = BIT(18),
+	QED_LM_40000baseSR4_Full_BIT = BIT(19),
+	QED_LM_25000baseCR_Full_BIT = BIT(20),
+	QED_LM_25000baseSR_Full_BIT = BIT(21),
+	QED_LM_50000baseCR2_Full_BIT = BIT(22),
+	QED_LM_100000baseSR4_Full_BIT = BIT(23),
+	QED_LM_100000baseCR4_Full_BIT = BIT(24),
+	QED_LM_100000baseLR4_ER4_Full_BIT = BIT(25),
+	QED_LM_50000baseSR2_Full_BIT = BIT(26),
+	QED_LM_1000baseX_Full_BIT = BIT(27),
+	QED_LM_10000baseCR_Full_BIT = BIT(28),
+	QED_LM_10000baseSR_Full_BIT = BIT(29),
+	QED_LM_10000baseLR_Full_BIT = BIT(30),
+	QED_LM_10000baseLRM_Full_BIT = BIT(31),
+	QED_LM_COUNT = 32
 };
 
 struct qed_link_params {
-- 
cgit v1.2.3


From 94a04d1d3d3681adde1a3e022b25dbac7b345b7e Mon Sep 17 00:00:00 2001
From: Yonatan Cohen <yonatanc@mellanox.com>
Date: Tue, 9 Oct 2018 12:05:12 +0300
Subject: net/mlx5: Expose DC scatter to CQE capability bit

dc_req_scat_data_cqe capability bit determines
if requester scatter to cqe is available for 64 bytes CQE over
DC transport type.

Signed-off-by: Yonatan Cohen <yonatanc@mellanox.com>
Reviewed-by: Guy Levi <guyle@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 68f4d5f9d929..0f460fb22c31 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1005,7 +1005,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         umr_modify_atomic_disabled[0x1];
 	u8         umr_indirect_mkey_disabled[0x1];
 	u8         umr_fence[0x2];
-	u8         reserved_at_20c[0x3];
+	u8         dc_req_scat_data_cqe[0x1];
+	u8         reserved_at_20d[0x2];
 	u8         drain_sigerr[0x1];
 	u8         cmdif_checksum[0x2];
 	u8         sigerr_cqe[0x1];
-- 
cgit v1.2.3


From 3f4c3127d332000530349db4843deece27fe5e0c Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Oct 2018 10:36:01 -0700
Subject: bpf: sockmap, fix skmsg recvmsg handler to track size correctly

When converting sockmap to new skmsg generic data structures we missed
that the recvmsg handler did not correctly use sg.size and instead was
using individual elements length. The result is if a sock is closed
with outstanding data we omit the call to sk_mem_uncharge() and can
get the warning below.

[   66.728282] WARNING: CPU: 6 PID: 5783 at net/core/stream.c:206 sk_stream_kill_queues+0x1fa/0x210

To fix this correct the redirect handler to xfer the size along with
the scatterlist and also decrement the size from the recvmsg handler.
Now when a sock is closed the remaining 'size' will be decremented
with sk_mem_uncharge().

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h | 1 +
 net/ipv4/tcp_bpf.c    | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 0b919f0bc6d6..31df0d9fa536 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -176,6 +176,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
 {
 	dst->sg.data[which] = src->sg.data[which];
 	dst->sg.data[which].length  = size;
+	dst->sg.size		   += size;
 	src->sg.data[which].length -= size;
 	src->sg.data[which].offset += size;
 }
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 80debb0daf37..f9d3cf185827 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -73,6 +73,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			sge->offset += copy;
 			sge->length -= copy;
 			sk_mem_uncharge(sk, copy);
+			msg_rx->sg.size -= copy;
 			if (!sge->length) {
 				i++;
 				if (i == MAX_SKB_FRAGS)
-- 
cgit v1.2.3


From 8734a162c13b1a893e7dff8de0df81fed04c51a6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Oct 2018 11:07:59 -0700
Subject: bpf: skmsg, improve sk_msg_used_element to work in cork context

Currently sk_msg_used_element is only called in zerocopy context where
cork is not possible and if this case happens we fallback to copy
mode. However the helper is more useful if it works in all contexts.

This patch resolved the case where if end == head indicating a full
or empty ring the helper always reports an empty ring. To fix this
add a test for the full ring case to avoid reporting a full ring
has 0 elements. This additional functionality will be used in the
next patches from recvmsg context where end = head with a full ring
is a valid case.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 31df0d9fa536..22347b08e1f8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -187,18 +187,21 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
 	sk_msg_init(src);
 }
 
+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+	return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
 static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
 {
+	if (sk_msg_full(msg))
+		return MAX_MSG_FRAGS;
+
 	return msg->sg.end >= msg->sg.start ?
 		msg->sg.end - msg->sg.start :
 		msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
 }
 
-static inline bool sk_msg_full(const struct sk_msg *msg)
-{
-	return (msg->sg.end == msg->sg.start) && msg->sg.size;
-}
-
 static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
 {
 	return &msg->sg.data[which];
-- 
cgit v1.2.3


From 02c558b2d5d679fbbcaa5b9689484c7e0f8abb7b Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 16 Oct 2018 11:08:04 -0700
Subject: bpf: sockmap, support for msg_peek in sk_msg with redirect ingress

This adds support for the MSG_PEEK flag when doing redirect to ingress
and receiving on the sk_msg psock queue. Previously the flag was
being ignored which could confuse applications if they expected the
flag to work as normal.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h  |  2 +-
 net/ipv4/tcp_bpf.c | 42 +++++++++++++++++++++++++++---------------
 net/tls/tls_sw.c   |  3 ++-
 3 files changed, 30 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3600ae0f25c3..14fdd7ce9992 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2089,7 +2089,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len);
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len);
+		      struct msghdr *msg, int len, int flags);
 
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index f9d3cf185827..b7918d4caa30 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
 }
 
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len)
+		      struct msghdr *msg, int len, int flags)
 {
 	struct iov_iter *iter = &msg->msg_iter;
+	int peek = flags & MSG_PEEK;
 	int i, ret, copied = 0;
+	struct sk_msg *msg_rx;
+
+	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+					  struct sk_msg, list);
 
 	while (copied != len) {
 		struct scatterlist *sge;
-		struct sk_msg *msg_rx;
 
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
 		if (unlikely(!msg_rx))
 			break;
 
@@ -70,22 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			}
 
 			copied += copy;
-			sge->offset += copy;
-			sge->length -= copy;
-			sk_mem_uncharge(sk, copy);
-			msg_rx->sg.size -= copy;
-			if (!sge->length) {
-				i++;
-				if (i == MAX_SKB_FRAGS)
-					i = 0;
-				if (!msg_rx->skb)
-					put_page(page);
+			if (likely(!peek)) {
+				sge->offset += copy;
+				sge->length -= copy;
+				sk_mem_uncharge(sk, copy);
+				msg_rx->sg.size -= copy;
+
+				if (!sge->length) {
+					sk_msg_iter_var_next(i);
+					if (!msg_rx->skb)
+						put_page(page);
+				}
+			} else {
+				sk_msg_iter_var_next(i);
 			}
 
 			if (copied == len)
 				break;
 		} while (i != msg_rx->sg.end);
 
+		if (unlikely(peek)) {
+			msg_rx = list_next_entry(msg_rx, list);
+			continue;
+		}
+
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
 			list_del(&msg_rx->list);
@@ -93,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 				consume_skb(msg_rx->skb);
 			kfree(msg_rx);
 		}
+		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+						  struct sk_msg, list);
 	}
 
 	return copied;
@@ -115,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 	lock_sock(sk);
 msg_bytes_ready:
-	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
 		int data, err = 0;
 		long timeo;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index a525fc4c2a4b..5cd88ba8acd1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1478,7 +1478,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		skb = tls_wait_data(sk, psock, flags, timeo, &err);
 		if (!skb) {
 			if (psock) {
-				int ret = __tcp_bpf_recvmsg(sk, psock, msg, len);
+				int ret = __tcp_bpf_recvmsg(sk, psock,
+							    msg, len, flags);
 
 				if (ret > 0) {
 					copied += ret;
-- 
cgit v1.2.3


From b8aee82250b7d90a32b11ba208656f52dbaca342 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Tue, 2 Oct 2018 22:57:24 +0000
Subject: net/mlx5: E-Switch, Get counters for offloaded flows from callers

There's no real reason for the e-switch logic to manage the creation of
counters for offloaded flows. The API already has the directive for the
caller to denote they want to attach a counter to the created flow.
As such, we go and move the management of flow counters to the mlx5e
tc offload logic. This also lets us remove an inelegant interface where
the FS layer had to provide a way to retrieve a counter from a flow rule.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 32 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  1 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 20 ++------------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 15 ----------
 include/linux/mlx5/fs.h                            |  1 -
 5 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index acf7a847f561..8a27c0813a18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -61,6 +61,7 @@ struct mlx5_nic_flow_attr {
 	u32 hairpin_tirn;
 	u8 match_level;
 	struct mlx5_flow_table	*hairpin_ft;
+	struct mlx5_fc		*counter;
 };
 
 #define MLX5E_TC_FLOW_BASE (MLX5E_TC_LAST_EXPORTED_BIT + 1)
@@ -721,6 +722,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 		dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
 		dest[dest_ix].counter = counter;
 		dest_ix++;
+		attr->counter = counter;
 	}
 
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
@@ -797,7 +799,7 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
 	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
 	struct mlx5_fc *counter = NULL;
 
-	counter = mlx5_flow_rule_counter(flow->rule[0]);
+	counter = attr->counter;
 	mlx5_del_flow_rules(flow->rule[0]);
 	mlx5_fc_destroy(priv->mdev, counter);
 
@@ -833,6 +835,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
 	struct net_device *out_dev, *encap_dev = NULL;
 	struct mlx5_flow_handle *rule = NULL;
+	struct mlx5_fc *counter = NULL;
 	struct mlx5e_rep_priv *rpriv;
 	struct mlx5e_priv *out_priv;
 	int err;
@@ -868,6 +871,16 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		}
 	}
 
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		counter = mlx5_fc_create(esw->dev, true);
+		if (IS_ERR(counter)) {
+			rule = ERR_CAST(counter);
+			goto err_create_counter;
+		}
+
+		attr->counter = counter;
+	}
+
 	/* we get here if (1) there's no error (rule being null) or when
 	 * (2) there's an encap action and we're on -EAGAIN (no valid neigh)
 	 */
@@ -888,6 +901,8 @@ err_fwd_rule:
 	mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
 	rule = flow->rule[1];
 err_add_rule:
+	mlx5_fc_destroy(esw->dev, counter);
+err_create_counter:
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
 		mlx5e_detach_mod_hdr(priv, flow);
 err_mod_hdr:
@@ -921,6 +936,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
 		mlx5e_detach_mod_hdr(priv, flow);
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
+		mlx5_fc_destroy(esw->dev, attr->counter);
 }
 
 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
@@ -992,6 +1010,14 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 	}
 }
 
+static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow)
+{
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		return flow->esw_attr->counter;
+	else
+		return flow->nic_attr->counter;
+}
+
 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 {
 	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
@@ -1017,7 +1043,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 			continue;
 		list_for_each_entry(flow, &e->flows, encap) {
 			if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
-				counter = mlx5_flow_rule_counter(flow->rule[0]);
+				counter = mlx5e_tc_get_counter(flow);
 				mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
 					neigh_used = true;
@@ -3019,7 +3045,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 	if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
 		return 0;
 
-	counter = mlx5_flow_rule_counter(flow->rule[0]);
+	counter = mlx5e_tc_get_counter(flow);
 	if (!counter)
 		return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index dfc642de4e6d..c1b627577003 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -266,6 +266,7 @@ struct mlx5_esw_flow_attr {
 	u32	encap_id;
 	u32	mod_hdr_id;
 	u8	match_level;
+	struct mlx5_fc *counter;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 0741683f7d70..a2f2d726c99b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -51,7 +51,6 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
 	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_table *ft = NULL;
-	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
 	int j, i = 0;
 	void *misc;
@@ -91,13 +90,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 		}
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-		counter = mlx5_fc_create(esw->dev, true);
-		if (IS_ERR(counter)) {
-			rule = ERR_CAST(counter);
-			goto err_counter_alloc;
-		}
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest[i].counter = counter;
+		dest[i].counter = attr->counter;
 		i++;
 	}
 
@@ -132,15 +126,11 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 
 	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
-		goto err_add_rule;
+		goto out;
 	else
 		esw->offloads.num_flows++;
 
-	return rule;
-
-err_add_rule:
-	mlx5_fc_destroy(esw->dev, counter);
-err_counter_alloc:
+out:
 	return rule;
 }
 
@@ -200,11 +190,7 @@ mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_handle *rule,
 				struct mlx5_esw_flow_attr *attr)
 {
-	struct mlx5_fc *counter = NULL;
-
-	counter = mlx5_flow_rule_counter(rule);
 	mlx5_del_flow_rules(rule);
-	mlx5_fc_destroy(esw->dev, counter);
 	esw->offloads.num_flows--;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 8d340e5181f8..9e18e6c0a8b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1474,21 +1474,6 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	return handle;
 }
 
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handle)
-{
-	struct mlx5_flow_rule *dst;
-	struct fs_fte *fte;
-
-	fs_get_obj(fte, handle->rule[0]->node.parent);
-
-	fs_for_each_dst(dst, fte) {
-		if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
-			return dst->dest_attr.counter;
-	}
-
-	return NULL;
-}
-
 static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
 {
 	if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index b1c026f1c8ba..74d0ea146c9a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -186,7 +186,6 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
 				 struct mlx5_flow_destination *new_dest,
 				 struct mlx5_flow_destination *old_dest);
 
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handler);
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
-- 
cgit v1.2.3


From 171c7625bef999848ee6032c6dde96e7330c4d15 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Wed, 3 Oct 2018 00:03:35 +0000
Subject: net/mlx5: Use flow counter IDs and not the wrapping cache object

Currently, when a flow rule is created using the FS core layer, the caller
has to pass the entire flow counter object and not just the counter HW
handle (ID). This requires both the FS core and the caller to have
knowledge about the inner implementation of the FS layer flow counters
cache and limits the possible users.

Move to use the counter ID across the place when dealing with flows.

Doing this decoupling, now can we privatize the inner implementation
of the flow counters.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c                            |  7 +++++--
 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c              |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c            |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c             |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c            | 10 ++--------
 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c        |  6 ++++++
 include/linux/mlx5/fs.h                                      |  3 ++-
 9 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5d9b7f62a0ba..5ced0cc46ba1 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3320,15 +3320,18 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 	}
 
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		struct mlx5_ib_mcounters *mcounters;
+
 		err = flow_counters_set_data(flow_act.counters, ucmd);
 		if (err)
 			goto free;
 
+		mcounters = to_mcounters(flow_act.counters);
 		handler->ibcounters = flow_act.counters;
 		dest_arr[dest_num].type =
 			MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest_arr[dest_num].counter =
-			to_mcounters(flow_act.counters)->hw_cntrs_hndl;
+		dest_arr[dest_num].counter_id =
+			mlx5_fc_id(mcounters->hw_cntrs_hndl);
 		dest_num++;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index e83dda441a81..d027ce00c8ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -252,10 +252,10 @@ TRACE_EVENT(mlx5_fs_add_rule,
 			   memcpy(__entry->destination,
 				  &rule->dest_attr,
 				  sizeof(__entry->destination));
-			   if (rule->dest_attr.type & MLX5_FLOW_DESTINATION_TYPE_COUNTER &&
-			       rule->dest_attr.counter)
+			   if (rule->dest_attr.type &
+			       MLX5_FLOW_DESTINATION_TYPE_COUNTER)
 				__entry->counter_id =
-				rule->dest_attr.counter->id;
+					rule->dest_attr.counter_id;
 	    ),
 	    TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n",
 		      __entry->rule, __entry->fte, __entry->index,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 8a27c0813a18..5ce87f54852d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -720,7 +720,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 			goto err_fc_create;
 		}
 		dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest[dest_ix].counter = counter;
+		dest[dest_ix].counter_id = mlx5_fc_id(counter);
 		dest_ix++;
 		attr->counter = counter;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index e1d47fa5ab83..9c893d7d273e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1198,7 +1198,7 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	if (counter) {
 		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
 		drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		drop_ctr_dst.counter = counter;
+		drop_ctr_dst.counter_id = mlx5_fc_id(counter);
 		dst = &drop_ctr_dst;
 		dest_num++;
 	}
@@ -1285,7 +1285,7 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 	if (counter) {
 		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
 		drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		drop_ctr_dst.counter = counter;
+		drop_ctr_dst.counter_id = mlx5_fc_id(counter);
 		dst = &drop_ctr_dst;
 		dest_num++;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a2f2d726c99b..39932dce15cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -91,7 +91,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest[i].counter = attr->counter;
+		dest[i].counter_id = mlx5_fc_id(attr->counter);
 		i++;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index dc8d7f6b52c2..08a891f9aade 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -419,7 +419,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 				continue;
 
 			MLX5_SET(flow_counter_list, in_dests, flow_counter_id,
-				 dst->dest_attr.counter->id);
+				 dst->dest_attr.counter_id);
 			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
 			list_size++;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9e18e6c0a8b3..cdcbf9d0ae6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1474,14 +1474,8 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	return handle;
 }
 
-static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
+static bool counter_is_valid(u32 action)
 {
-	if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
-		return !counter;
-
-	if (!counter)
-		return false;
-
 	return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP |
 			  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST));
 }
@@ -1491,7 +1485,7 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 			  struct mlx5_flow_table *ft)
 {
 	if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER))
-		return counter_is_valid(dest->counter, action);
+		return counter_is_valid(action);
 
 	if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
 		return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 09206c4acd9a..1329bc5b7969 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -258,6 +258,12 @@ err_out:
 }
 EXPORT_SYMBOL(mlx5_fc_create);
 
+u32 mlx5_fc_id(struct mlx5_fc *counter)
+{
+	return counter->id;
+}
+EXPORT_SYMBOL(mlx5_fc_id);
+
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
 {
 	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 74d0ea146c9a..a5fc62184195 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -92,7 +92,7 @@ struct mlx5_flow_destination {
 		u32			tir_num;
 		u32			ft_num;
 		struct mlx5_flow_table	*ft;
-		struct mlx5_fc		*counter;
+		u32			counter_id;
 		struct {
 			u16		num;
 			u16		vhca_id;
@@ -192,6 +192,7 @@ void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse);
 int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
 		  u64 *packets, u64 *bytes);
+u32 mlx5_fc_id(struct mlx5_fc *counter);
 
 int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
 int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
-- 
cgit v1.2.3


From b9aa0ba17af5afa13605eb6ea91f1974da97a2e2 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Thu, 31 May 2018 11:50:23 +0300
Subject: net/mlx5: Add cap bits for multi fdb encap

If set, the firmware supports creating of flow tables with encap
enabled while VFs are configured, if we already created one
(restriction still applies on the first creation).

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 15e36198f85f..963611820006 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -584,7 +584,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      reserved_at_0[0x1c];
 	u8      fdb_multi_path_to_table[0x1];
-	u8      reserved_at_1d[0x1e3];
+	u8      reserved_at_1d[0x1];
+	u8      multi_fdb_encap[0x1];
+	u8      reserved_at_1e[0x1e1];
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb;
 
-- 
cgit v1.2.3


From 328edb499f99126946845ece477c9c1afe8631af Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 3 Jul 2018 11:13:00 +0300
Subject: net/mlx5: Split FDB fast path prio to multiple namespaces

Towards supporting multi-chains and priorities, split the FDB fast path
to multiple namespaces (sub namespaces), each with multiple priorities.

This patch adds a new flow steering type, FS_TYPE_PRIO_CHAINS, which is
like current FS_TYPE_PRIO, but may contain only namespaces, and those
will be in parallel to one another in terms of managing of the flow
tables connections inside them. Meaning, while searching for the next
or previous flow table to connect for a new table inside such namespace
we skip the parallel namespaces in the same level under the
FS_TYPE_PRIO_CHAINS prio we originated from.

We use this new type for splitting the fast path prio into multiple
parallel namespaces, each containing normal prios.
The prios inside them (and their tables) will be connected to one
another, but not from one parallel namespace to another, instead the
last prio in each namespace will be connected to the next prio in
the containing FDB namespace, which is the slow path prio.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |  7 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 88 ++++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 13 ++++
 include/linux/mlx5/fs.h                           |  2 +
 5 files changed, 101 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9c893d7d273e..d004957328f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -263,7 +263,7 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw)
 	esw_debug(dev, "Create FDB log_max_size(%d)\n",
 		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
 
-	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	root_ns = mlx5_get_fdb_sub_ns(dev, 0);
 	if (!root_ns) {
 		esw_warn(dev, "Failed to get FDB flow namespace\n");
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index c1b627577003..1698a322a7c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -59,6 +59,9 @@
 #define mlx5_esw_has_fwd_fdb(dev) \
 	MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
 
+#define FDB_MAX_CHAIN 3
+#define FDB_MAX_PRIO 16
+
 struct vport_ingress {
 	struct mlx5_flow_table *acl;
 	struct mlx5_flow_group *allow_untagged_spoofchk_grp;
@@ -319,6 +322,10 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
 static inline void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) {}
 static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
 static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
+
+#define FDB_MAX_CHAIN 1
+#define FDB_MAX_PRIO 1
+
 #endif /* CONFIG_MLX5_ESWITCH */
 
 #endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index cdcbf9d0ae6c..7eb6d58733ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -40,6 +40,7 @@
 #include "diag/fs_tracepoint.h"
 #include "accel/ipsec.h"
 #include "fpga/ipsec.h"
+#include "eswitch.h"
 
 #define INIT_TREE_NODE_ARRAY_SIZE(...)	(sizeof((struct init_tree_node[]){__VA_ARGS__}) /\
 					 sizeof(struct init_tree_node))
@@ -713,7 +714,7 @@ static struct mlx5_flow_table *find_closest_ft_recursive(struct fs_node  *root,
 	struct fs_node *iter = list_entry(start, struct fs_node, list);
 	struct mlx5_flow_table *ft = NULL;
 
-	if (!root)
+	if (!root || root->type == FS_TYPE_PRIO_CHAINS)
 		return NULL;
 
 	list_for_each_advance_continue(iter, &root->children, reverse) {
@@ -1973,6 +1974,18 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg)
 			       fg->id);
 }
 
+struct mlx5_flow_namespace *mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev,
+						int n)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+
+	if (!steering || !steering->fdb_sub_ns)
+		return NULL;
+
+	return steering->fdb_sub_ns[n];
+}
+EXPORT_SYMBOL(mlx5_get_fdb_sub_ns);
+
 struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 						    enum mlx5_flow_namespace_type type)
 {
@@ -2051,8 +2064,10 @@ struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_d
 	}
 }
 
-static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
-				      unsigned int prio, int num_levels)
+static struct fs_prio *_fs_create_prio(struct mlx5_flow_namespace *ns,
+				       unsigned int prio,
+				       int num_levels,
+				       enum fs_node_type type)
 {
 	struct fs_prio *fs_prio;
 
@@ -2060,7 +2075,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 	if (!fs_prio)
 		return ERR_PTR(-ENOMEM);
 
-	fs_prio->node.type = FS_TYPE_PRIO;
+	fs_prio->node.type = type;
 	tree_init_node(&fs_prio->node, NULL, del_sw_prio);
 	tree_add_node(&fs_prio->node, &ns->node);
 	fs_prio->num_levels = num_levels;
@@ -2070,6 +2085,19 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 	return fs_prio;
 }
 
+static struct fs_prio *fs_create_prio_chained(struct mlx5_flow_namespace *ns,
+					      unsigned int prio,
+					      int num_levels)
+{
+	return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO_CHAINS);
+}
+
+static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
+				      unsigned int prio, int num_levels)
+{
+	return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO);
+}
+
 static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace
 						     *ns)
 {
@@ -2374,6 +2402,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	cleanup_egress_acls_root_ns(dev);
 	cleanup_ingress_acls_root_ns(dev);
 	cleanup_root_ns(steering->fdb_root_ns);
+	steering->fdb_root_ns = NULL;
+	kfree(steering->fdb_sub_ns);
+	steering->fdb_sub_ns = NULL;
 	cleanup_root_ns(steering->sniffer_rx_root_ns);
 	cleanup_root_ns(steering->sniffer_tx_root_ns);
 	cleanup_root_ns(steering->egress_root_ns);
@@ -2419,27 +2450,64 @@ static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering)
 
 static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 {
-	struct fs_prio *prio;
+	struct mlx5_flow_namespace *ns;
+	struct fs_prio *maj_prio;
+	struct fs_prio *min_prio;
+	int levels;
+	int chain;
+	int prio;
+	int err;
 
 	steering->fdb_root_ns = create_root_ns(steering, FS_FT_FDB);
 	if (!steering->fdb_root_ns)
 		return -ENOMEM;
 
-	prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 2);
-	if (IS_ERR(prio))
+	steering->fdb_sub_ns = kzalloc(sizeof(steering->fdb_sub_ns) *
+				       FDB_MAX_CHAIN + 1, GFP_KERNEL);
+	if (!steering->fdb_sub_ns)
+		return -ENOMEM;
+
+	levels = 2 * FDB_MAX_PRIO * (FDB_MAX_CHAIN + 1);
+	maj_prio = fs_create_prio_chained(&steering->fdb_root_ns->ns, 0,
+					  levels);
+	if (IS_ERR(maj_prio)) {
+		err = PTR_ERR(maj_prio);
 		goto out_err;
+	}
 
-	prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
-	if (IS_ERR(prio))
+	for (chain = 0; chain <= FDB_MAX_CHAIN; chain++) {
+		ns = fs_create_namespace(maj_prio);
+		if (IS_ERR(ns)) {
+			err = PTR_ERR(ns);
+			goto out_err;
+		}
+
+		for (prio = 0; prio < FDB_MAX_PRIO * (chain + 1); prio++) {
+			min_prio = fs_create_prio(ns, prio, 2);
+			if (IS_ERR(min_prio)) {
+				err = PTR_ERR(min_prio);
+				goto out_err;
+			}
+		}
+
+		steering->fdb_sub_ns[chain] = ns;
+	}
+
+	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
+	if (IS_ERR(maj_prio)) {
+		err = PTR_ERR(maj_prio);
 		goto out_err;
+	}
 
 	set_prio_attrs(steering->fdb_root_ns);
 	return 0;
 
 out_err:
 	cleanup_root_ns(steering->fdb_root_ns);
+	kfree(steering->fdb_sub_ns);
+	steering->fdb_sub_ns = NULL;
 	steering->fdb_root_ns = NULL;
-	return PTR_ERR(prio);
+	return err;
 }
 
 static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a06f83c0c2b6..b51ad217da32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -38,9 +38,21 @@
 #include <linux/rhashtable.h>
 #include <linux/llist.h>
 
+/* FS_TYPE_PRIO_CHAINS is a PRIO that will have namespaces only,
+ * and those are in parallel to one another when going over them to connect
+ * a new flow table. Meaning the last flow table in a TYPE_PRIO prio in one
+ * parallel namespace will not automatically connect to the first flow table
+ * found in any prio in any next namespace, but skip the entire containing
+ * TYPE_PRIO_CHAINS prio.
+ *
+ * This is used to implement tc chains, each chain of prios is a different
+ * namespace inside a containing TYPE_PRIO_CHAINS prio.
+ */
+
 enum fs_node_type {
 	FS_TYPE_NAMESPACE,
 	FS_TYPE_PRIO,
+	FS_TYPE_PRIO_CHAINS,
 	FS_TYPE_FLOW_TABLE,
 	FS_TYPE_FLOW_GROUP,
 	FS_TYPE_FLOW_ENTRY,
@@ -73,6 +85,7 @@ struct mlx5_flow_steering {
 	struct kmem_cache               *ftes_cache;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
+	struct mlx5_flow_namespace	**fdb_sub_ns;
 	struct mlx5_flow_root_namespace **esw_egress_root_ns;
 	struct mlx5_flow_root_namespace **esw_ingress_root_ns;
 	struct mlx5_flow_root_namespace	*sniffer_tx_root_ns;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index a5fc62184195..f8d00872c7d3 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -101,6 +101,8 @@ struct mlx5_flow_destination {
 	};
 };
 
+struct mlx5_flow_namespace *
+mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev, int n);
 struct mlx5_flow_namespace *
 mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 			enum mlx5_flow_namespace_type type);
-- 
cgit v1.2.3


From d5634fee245f9e92787e3a34ef621fc12b2cbf16 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Thu, 20 Sep 2018 12:17:48 +0200
Subject: net/mlx5: Add a no-append flow insertion mode

If no-append flag is set, we will add a new FTE, instead of appending
the actions of the inserted rule when the same match already exists.

While here, move the has_flow_tag boolean indicator to be a flag too.

This patch doesn't change any functionality.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanmox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c                    |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c    |  9 ++++++++-
 include/linux/mlx5/fs.h                              | 14 +++++++++++---
 5 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5ced0cc46ba1..af32899bb72a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2793,7 +2793,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
 			return -EINVAL;
 
 		action->flow_tag = ib_spec->flow_tag.tag_id;
-		action->has_flow_tag = true;
+		action->flags |= FLOW_ACT_HAS_TAG;
 		break;
 	case IB_FLOW_SPEC_ACTION_DROP:
 		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
@@ -2886,7 +2886,7 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
 		return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA;
 
 	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop && !flow_act->has_flow_tag)) ?
+		(!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ?
 		VALID_SPEC_VALID : VALID_SPEC_INVALID;
 }
 
@@ -3349,7 +3349,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 					MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 	}
 
-	if (flow_act.has_flow_tag &&
+	if ((flow_act.flags & FLOW_ACT_HAS_TAG)  &&
 	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
 	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
 		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 6c04e11f9a05..a9c68b7859b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -684,9 +684,9 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 	struct mlx5_flow_destination dest[2] = {};
 	struct mlx5_flow_act flow_act = {
 		.action = attr->action,
-		.has_flow_tag = true,
 		.flow_tag = attr->flow_tag,
 		.reformat_id = 0,
+		.flags    = FLOW_ACT_HAS_TAG,
 	};
 	struct mlx5_fc *counter = NULL;
 	bool table_created = false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 5645a4facad2..28aa8c968a80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -650,7 +650,7 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
 	    (match_criteria_enable &
 	     ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) ||
 	    (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) ||
-	     flow_act->has_flow_tag)
+	     (flow_act->flags & FLOW_ACT_HAS_TAG))
 		return false;
 
 	return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 7eb6d58733ac..67ba4c975d81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1428,7 +1428,7 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
 		return -EEXIST;
 	}
 
-	if (flow_act->has_flow_tag &&
+	if ((flow_act->flags & FLOW_ACT_HAS_TAG) &&
 	    fte->action.flow_tag != flow_act->flow_tag) {
 		mlx5_core_warn(get_dev(&fte->node),
 			       "FTE flow tag %u already exists with different flow tag %u\n",
@@ -1628,6 +1628,8 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
 
 search_again_locked:
 	version = matched_fgs_get_version(match_head);
+	if (flow_act->flags & FLOW_ACT_NO_APPEND)
+		goto skip_search;
 	/* Try to find a fg that already contains a matching fte */
 	list_for_each_entry(iter, match_head, list) {
 		struct fs_fte *fte_tmp;
@@ -1644,6 +1646,11 @@ search_again_locked:
 		return rule;
 	}
 
+skip_search:
+	/* No group with matching fte found, or we skipped the search.
+	 * Try to add a new fte to any matching fg.
+	 */
+
 	/* Check the ft version, for case that new flow group
 	 * was added while the fgs weren't locked
 	 */
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index f8d00872c7d3..5660f07d3be0 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -158,20 +158,28 @@ struct mlx5_fs_vlan {
 
 #define MLX5_FS_VLAN_DEPTH	2
 
+enum {
+	FLOW_ACT_HAS_TAG   = BIT(0),
+	FLOW_ACT_NO_APPEND = BIT(1),
+};
+
 struct mlx5_flow_act {
 	u32 action;
-	bool has_flow_tag;
 	u32 flow_tag;
 	u32 reformat_id;
 	u32 modify_id;
 	uintptr_t esp_id;
+	u32 flags;
 	struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH];
 	struct ib_counters *counters;
 };
 
 #define MLX5_DECLARE_FLOW_ACT(name) \
-	struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
-				     MLX5_FS_DEFAULT_FLOW_TAG, 0, 0}
+	struct mlx5_flow_act name = { .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
+				      .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, \
+				      .reformat_id = 0, \
+				      .modify_id = 0, \
+				      .flags =  0, }
 
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
-- 
cgit v1.2.3


From cca45e054ce55c06046a37bf4d3fd7c17edd57da Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:10 +0000
Subject: vxlan: Export address checking functions

Drivers that support VxLAN offload need to be able to sanitize the
configuration of the VxLAN device and accept / reject its offload.

For example, mlxsw requires that the local IP of the VxLAN device be set
and that packets be flooded to unicast IP(s) and not to a multicast
group.

Expose the functions that perform such checks.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 26 --------------------------
 include/net/vxlan.h | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 018406c4d944..0fc2b1d82d4c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -103,22 +103,6 @@ bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
 		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
 }
 
-static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
-{
-	if (ipa->sa.sa_family == AF_INET6)
-		return ipv6_addr_any(&ipa->sin6.sin6_addr);
-	else
-		return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
-}
-
-static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
-{
-	if (ipa->sa.sa_family == AF_INET6)
-		return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
-	else
-		return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
-}
-
 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
 {
 	if (nla_len(nla) >= sizeof(struct in6_addr)) {
@@ -151,16 +135,6 @@ bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
 	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
 }
 
-static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
-{
-	return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
-}
-
-static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
-{
-	return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
-}
-
 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
 {
 	if (nla_len(nla) >= sizeof(struct in6_addr)) {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 7ef15179f263..dd3d72ce64b6 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -370,4 +370,36 @@ static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
 	return vs->sock->sk->sk_family;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+	if (ipa->sa.sa_family == AF_INET6)
+		return ipv6_addr_any(&ipa->sin6.sin6_addr);
+	else
+		return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+	if (ipa->sa.sa_family == AF_INET6)
+		return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
+	else
+		return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+#else /* !IS_ENABLED(CONFIG_IPV6) */
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+	return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+	return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
 #endif
-- 
cgit v1.2.3


From 28e450333d4d1328710e258d38793c61658d4c95 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:12 +0000
Subject: inet: Refactor INET_ECN_decapsulate()

Drivers that support tunnel decapsulation (IPinIP or NVE) need to
configure the underlying device to conform to the behavior outlined in
RFC 6040 with respect to the ECN bits.

This behavior is implemented by INET_ECN_decapsulate() which requires an
skb to be passed where the ECN CE bit can be potentially set. Since
these drivers do not need to mark an skb, but only configure the device
to do so, factor out the business logic to __INET_ECN_decapsulate() and
potentially perform the marking in INET_ECN_decapsulate().

This allows drivers to invoke __INET_ECN_decapsulate() and configure the
device.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Suggested-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_ecn.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index 482a1b705362..c8e2bebd8d93 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -183,8 +183,7 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb)
  *          1 if something is broken and should be logged (!!! above)
  *          2 if packet should be dropped
  */
-static inline int INET_ECN_decapsulate(struct sk_buff *skb,
-				       __u8 outer, __u8 inner)
+static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce)
 {
 	if (INET_ECN_is_not_ect(inner)) {
 		switch (outer & INET_ECN_MASK) {
@@ -198,10 +197,21 @@ static inline int INET_ECN_decapsulate(struct sk_buff *skb,
 		}
 	}
 
-	if (INET_ECN_is_ce(outer))
+	*set_ce = INET_ECN_is_ce(outer);
+	return 0;
+}
+
+static inline int INET_ECN_decapsulate(struct sk_buff *skb,
+				       __u8 outer, __u8 inner)
+{
+	bool set_ce = false;
+	int rc;
+
+	rc = __INET_ECN_decapsulate(outer, inner, &set_ce);
+	if (!rc && set_ce)
 		INET_ECN_set_ce(skb);
 
-	return 0;
+	return rc;
 }
 
 static inline int IP_ECN_decapsulate(const struct iphdr *oiph,
-- 
cgit v1.2.3


From 5ff4ff4fe8c4e7d0de1d837e489056f0c470667b Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:20 +0000
Subject: net: Add netif_is_vxlan()

Add the ability to determine whether a netdev is a VxLAN netdev by
calling the above mentioned function that checks the netdev's
rtnl_link_ops.

This will allow modules to identify netdev events involving a VxLAN
netdev and act accordingly. For example, drivers capable of VxLAN
offload will need to configure the underlying device when a VxLAN netdev
is being enslaved to an offloaded bridge.

Convert nfp to use the newly introduced helper.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 3 ++-
 include/net/vxlan.h                                     | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
index 30c926c4bc47..8e5bec04d1f9 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
@@ -4,6 +4,7 @@
 #include <linux/etherdevice.h>
 #include <linux/inetdevice.h>
 #include <net/netevent.h>
+#include <net/vxlan.h>
 #include <linux/idr.h>
 #include <net/dst_metadata.h>
 #include <net/arp.h>
@@ -187,7 +188,7 @@ static bool nfp_tun_is_netdev_to_offload(struct net_device *netdev)
 		return false;
 	if (!strcmp(netdev->rtnl_link_ops->kind, "openvswitch"))
 		return true;
-	if (!strcmp(netdev->rtnl_link_ops->kind, "vxlan"))
+	if (netif_is_vxlan(netdev))
 		return true;
 
 	return false;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index dd3d72ce64b6..95227fa925e8 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -5,6 +5,7 @@
 #include <linux/if_vlan.h>
 #include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
+#include <net/rtnetlink.h>
 
 /* VXLAN protocol (RFC 7348) header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -402,4 +403,10 @@ static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
 
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
+static inline bool netif_is_vxlan(const struct net_device *dev)
+{
+	return dev->rtnl_link_ops &&
+	       !strcmp(dev->rtnl_link_ops->kind, "vxlan");
+}
+
 #endif
-- 
cgit v1.2.3


From 9a99735317866e821c75f957fc85c63d049d330c Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:22 +0000
Subject: vxlan: Add switchdev notifications

When offloading VXLAN devices, drivers need to know about events in
VXLAN FDB database. Since VXLAN models a bridge, it is natural to
distribute the VXLAN FDB notifications using the pre-existing switchdev
notification mechanism.

To that end, introduce two new notification types:
SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE and SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE.
Introduce a new function, vxlan_fdb_switchdev_call_notifiers() to send
the new notifier types, and a struct switchdev_notifier_vxlan_fdb_info
to communicate the details of the FDB entry under consideration.

Invoke the new function from vxlan_fdb_notify().

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c     | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 include/net/switchdev.h |  3 +++
 include/net/vxlan.h     | 11 +++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 0fc2b1d82d4c..de5caa2f6aac 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -327,8 +327,8 @@ static inline size_t vxlan_nlmsg_size(void)
 		+ nla_total_size(sizeof(struct nda_cacheinfo));
 }
 
-static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
-			     struct vxlan_rdst *rd, int type)
+static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
+			       struct vxlan_rdst *rd, int type)
 {
 	struct net *net = dev_net(vxlan->dev);
 	struct sk_buff *skb;
@@ -353,6 +353,48 @@ errout:
 		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
+static void vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
+					       struct vxlan_fdb *fdb,
+					       struct vxlan_rdst *rd,
+					       bool adding)
+{
+	struct switchdev_notifier_vxlan_fdb_info info;
+	enum switchdev_notifier_type notifier_type;
+
+	if (WARN_ON(!rd))
+		return;
+
+	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
+			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
+
+	info = (struct switchdev_notifier_vxlan_fdb_info){
+		.remote_ip = rd->remote_ip,
+		.remote_port = rd->remote_port,
+		.remote_vni = rd->remote_vni,
+		.remote_ifindex = rd->remote_ifindex,
+		.vni = fdb->vni,
+	};
+	memcpy(info.eth_addr, fdb->eth_addr, ETH_ALEN);
+
+	call_switchdev_notifiers(notifier_type, vxlan->dev,
+				 &info.info);
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
+			     struct vxlan_rdst *rd, int type)
+{
+	switch (type) {
+	case RTM_NEWNEIGH:
+		vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, true);
+		break;
+	case RTM_DELNEIGH:
+		vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, false);
+		break;
+	}
+
+	__vxlan_fdb_notify(vxlan, fdb, rd, type);
+}
+
 static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d574ce63bf22..47199a11c586 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -145,6 +145,9 @@ enum switchdev_notifier_type {
 	SWITCHDEV_FDB_ADD_TO_DEVICE,
 	SWITCHDEV_FDB_DEL_TO_DEVICE,
 	SWITCHDEV_FDB_OFFLOADED,
+
+	SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
+	SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE,
 };
 
 struct switchdev_notifier_info {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 95227fa925e8..3f00877f5edf 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -6,6 +6,7 @@
 #include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
+#include <net/switchdev.h>
 
 /* VXLAN protocol (RFC 7348) header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -409,4 +410,14 @@ static inline bool netif_is_vxlan(const struct net_device *dev)
 	       !strcmp(dev->rtnl_link_ops->kind, "vxlan");
 }
 
+struct switchdev_notifier_vxlan_fdb_info {
+	struct switchdev_notifier_info info; /* must be first */
+	union vxlan_addr remote_ip;
+	__be16 remote_port;
+	__be32 remote_vni;
+	u32 remote_ifindex;
+	u8 eth_addr[ETH_ALEN];
+	__be32 vni;
+};
+
 #endif
-- 
cgit v1.2.3


From 1941f1d6453a527ae8df59891da0319646608444 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:24 +0000
Subject: vxlan: Add vxlan_fdb_find_uc() for FDB querying

A switchdev-capable driver that is aware of VXLAN may need to query
VXLAN FDB. In the particular case of mlxsw, this functionality is
limited to querying UC FDBs. Those being easier to deal with than the
general case of RDST chain traversal, introduce an interface to query
specifically UC FDBs: vxlan_fdb_find_uc().

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 40 ++++++++++++++++++++++++++++++++++++++++
 include/net/vxlan.h | 12 ++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index de5caa2f6aac..410eee23c50c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -504,6 +504,46 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
 	return NULL;
 }
 
+int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
+		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	u8 eth_addr[ETH_ALEN + 2] = { 0 };
+	struct vxlan_rdst *rdst;
+	struct vxlan_fdb *f;
+	int rc = 0;
+
+	if (is_multicast_ether_addr(mac) ||
+	    is_zero_ether_addr(mac))
+		return -EINVAL;
+
+	ether_addr_copy(eth_addr, mac);
+
+	rcu_read_lock();
+
+	f = __vxlan_find_mac(vxlan, eth_addr, vni);
+	if (!f) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	rdst = first_remote_rcu(f);
+
+	memset(fdb_info, 0, sizeof(*fdb_info));
+	fdb_info->info.dev = dev;
+	fdb_info->remote_ip = rdst->remote_ip;
+	fdb_info->remote_port = rdst->remote_port;
+	fdb_info->remote_vni = rdst->remote_vni;
+	fdb_info->remote_ifindex = rdst->remote_ifindex;
+	fdb_info->vni = vni;
+	ether_addr_copy(fdb_info->eth_addr, mac);
+
+out:
+	rcu_read_unlock();
+	return rc;
+}
+EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);
+
 /* Replace destination of unicast mac */
 static int vxlan_fdb_replace(struct vxlan_fdb *f,
 			     union vxlan_addr *ip, __be16 port, __be32 vni,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 3f00877f5edf..1828d686ac4f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -420,4 +420,16 @@ struct switchdev_notifier_vxlan_fdb_info {
 	__be32 vni;
 };
 
+#if IS_ENABLED(CONFIG_VXLAN)
+int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
+		      struct switchdev_notifier_vxlan_fdb_info *fdb_info);
+#else
+static inline int
+vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
+		  struct switchdev_notifier_vxlan_fdb_info *fdb_info)
+{
+	return -ENOENT;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 0efe11733356273d734cc2c5ab2dc6f5865cbeb6 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:26 +0000
Subject: vxlan: Support marking RDSTs as offloaded

Offloaded bridge FDB entries are marked with NTF_OFFLOADED. Implement a
similar mechanism for VXLAN, where a given remote destination can be
marked as offloaded.

To that end, introduce a new event, SWITCHDEV_VXLAN_FDB_OFFLOADED,
through which the marking is communicated to the vxlan driver. To
identify which RDST should be marked as offloaded, an
switchdev_notifier_vxlan_fdb_info is passed to the listeners. The
"offloaded" flag in that object determines whether the offloaded mark
should be set or cleared.

When sending offloaded FDB entries over netlink, mark them with
NTF_OFFLOADED.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c     | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/net/switchdev.h |  1 +
 include/net/vxlan.h     |  2 ++
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 410eee23c50c..e98fc54379f8 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -272,6 +272,8 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	ndm->ndm_state = fdb->state;
 	ndm->ndm_ifindex = vxlan->dev->ifindex;
 	ndm->ndm_flags = fdb->flags;
+	if (rdst->offloaded)
+		ndm->ndm_flags |= NTF_OFFLOADED;
 	ndm->ndm_type = RTN_UNICAST;
 
 	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
@@ -373,6 +375,7 @@ static void vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
 		.remote_vni = rd->remote_vni,
 		.remote_ifindex = rd->remote_ifindex,
 		.vni = fdb->vni,
+		.offloaded = rd->offloaded,
 	};
 	memcpy(info.eth_addr, fdb->eth_addr, ETH_ALEN);
 
@@ -536,6 +539,7 @@ int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
 	fdb_info->remote_vni = rdst->remote_vni;
 	fdb_info->remote_ifindex = rdst->remote_ifindex;
 	fdb_info->vni = vni;
+	fdb_info->offloaded = rdst->offloaded;
 	ether_addr_copy(fdb_info->eth_addr, mac);
 
 out:
@@ -589,6 +593,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 
 	rd->remote_ip = *ip;
 	rd->remote_port = port;
+	rd->offloaded = false;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
 
@@ -3817,6 +3822,51 @@ static struct notifier_block vxlan_notifier_block __read_mostly = {
 	.notifier_call = vxlan_netdevice_event,
 };
 
+static void
+vxlan_fdb_offloaded_set(struct net_device *dev,
+			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_rdst *rdst;
+	struct vxlan_fdb *f;
+
+	spin_lock_bh(&vxlan->hash_lock);
+
+	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
+	if (!f)
+		goto out;
+
+	rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
+				   fdb_info->remote_port,
+				   fdb_info->remote_vni,
+				   fdb_info->remote_ifindex);
+	if (!rdst)
+		goto out;
+
+	rdst->offloaded = fdb_info->offloaded;
+
+out:
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+static int vxlan_switchdev_event(struct notifier_block *unused,
+				 unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
+		vxlan_fdb_offloaded_set(dev, ptr);
+		break;
+	}
+
+	return 0;
+}
+
+static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
+	.notifier_call = vxlan_switchdev_event,
+};
+
 static __net_init int vxlan_init_net(struct net *net)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -3890,11 +3940,17 @@ static int __init vxlan_init_module(void)
 	if (rc)
 		goto out2;
 
-	rc = rtnl_link_register(&vxlan_link_ops);
+	rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
 	if (rc)
 		goto out3;
 
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out4;
+
 	return 0;
+out4:
+	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
 out3:
 	unregister_netdevice_notifier(&vxlan_notifier_block);
 out2:
@@ -3907,6 +3963,7 @@ late_initcall(vxlan_init_module);
 static void __exit vxlan_cleanup_module(void)
 {
 	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
 	unregister_netdevice_notifier(&vxlan_notifier_block);
 	unregister_pernet_subsys(&vxlan_net_ops);
 	/* rcu_barrier() is called by netns */
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 47199a11c586..b040f82351ba 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -148,6 +148,7 @@ enum switchdev_notifier_type {
 
 	SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
 	SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE,
+	SWITCHDEV_VXLAN_FDB_OFFLOADED,
 };
 
 struct switchdev_notifier_info {
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 1828d686ac4f..03431c148e16 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -192,6 +192,7 @@ union vxlan_addr {
 struct vxlan_rdst {
 	union vxlan_addr	 remote_ip;
 	__be16			 remote_port;
+	u8			 offloaded:1;
 	__be32			 remote_vni;
 	u32			 remote_ifindex;
 	struct list_head	 list;
@@ -418,6 +419,7 @@ struct switchdev_notifier_vxlan_fdb_info {
 	u32 remote_ifindex;
 	u8 eth_addr[ETH_ALEN];
 	__be32 vni;
+	bool offloaded;
 };
 
 #if IS_ENABLED(CONFIG_VXLAN)
-- 
cgit v1.2.3


From e9ba0fbc7dd23a74e77960c98c988f59a1ff75aa Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 17 Oct 2018 08:53:29 +0000
Subject: bridge: switchdev: Allow clearing FDB entry offload indication

Currently, an FDB entry only ceases being offloaded when it is deleted.
This changes with VxLAN encapsulation.

Devices capable of performing VxLAN encapsulation usually have only one
FDB table, unlike the software data path which has two - one in the
bridge driver and another in the VxLAN driver.

Therefore, bridge FDB entries pointing to a VxLAN device are only
offloaded if there is a corresponding entry in the VxLAN FDB.

Allow clearing the offload indication in case the corresponding entry
was deleted from the VxLAN FDB.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 9 +++++----
 drivers/net/ethernet/rocker/rocker_main.c                | 1 +
 include/net/switchdev.h                                  | 3 ++-
 net/bridge/br.c                                          | 4 ++--
 net/bridge/br_fdb.c                                      | 4 ++--
 net/bridge/br_private.h                                  | 2 +-
 net/bridge/br_switchdev.c                                | 9 ++++++---
 net/dsa/slave.c                                          | 1 +
 8 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index fa16ad2c6a50..a89075beef94 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -2090,12 +2090,13 @@ void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
 static void
 mlxsw_sp_fdb_call_notifiers(enum switchdev_notifier_type type,
 			    const char *mac, u16 vid,
-			    struct net_device *dev)
+			    struct net_device *dev, bool offloaded)
 {
 	struct switchdev_notifier_fdb_info info;
 
 	info.addr = mac;
 	info.vid = vid;
+	info.offloaded = offloaded;
 	call_switchdev_notifiers(type, dev, &info.info);
 }
 
@@ -2147,7 +2148,7 @@ do_fdb_op:
 	if (!do_notification)
 		return;
 	type = adding ? SWITCHDEV_FDB_ADD_TO_BRIDGE : SWITCHDEV_FDB_DEL_TO_BRIDGE;
-	mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev);
+	mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev, adding);
 
 	return;
 
@@ -2207,7 +2208,7 @@ do_fdb_op:
 	if (!do_notification)
 		return;
 	type = adding ? SWITCHDEV_FDB_ADD_TO_BRIDGE : SWITCHDEV_FDB_DEL_TO_BRIDGE;
-	mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev);
+	mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev, adding);
 
 	return;
 
@@ -2312,7 +2313,7 @@ static void mlxsw_sp_switchdev_bridge_fdb_event_work(struct work_struct *work)
 			break;
 		mlxsw_sp_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED,
 					    fdb_info->addr,
-					    fdb_info->vid, dev);
+					    fdb_info->vid, dev, true);
 		break;
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index aeafdb9ac015..8721c0506af3 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2728,6 +2728,7 @@ rocker_fdb_offload_notify(struct rocker_port *rocker_port,
 
 	info.addr = recv_info->addr;
 	info.vid = recv_info->vid;
+	info.offloaded = true;
 	call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
 				 rocker_port->dev, &info.info);
 }
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index b040f82351ba..881ecb1555bf 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -159,7 +159,8 @@ struct switchdev_notifier_fdb_info {
 	struct switchdev_notifier_info info; /* must be first */
 	const unsigned char *addr;
 	u16 vid;
-	bool added_by_user;
+	u8 added_by_user:1,
+	   offloaded:1;
 };
 
 static inline struct net_device *
diff --git a/net/bridge/br.c b/net/bridge/br.c
index e411e40333e2..360ad66c21e9 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -151,7 +151,7 @@ static int br_switchdev_event(struct notifier_block *unused,
 			break;
 		}
 		br_fdb_offloaded_set(br, p, fdb_info->addr,
-				     fdb_info->vid);
+				     fdb_info->vid, true);
 		break;
 	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
 		fdb_info = ptr;
@@ -163,7 +163,7 @@ static int br_switchdev_event(struct notifier_block *unused,
 	case SWITCHDEV_FDB_OFFLOADED:
 		fdb_info = ptr;
 		br_fdb_offloaded_set(br, p, fdb_info->addr,
-				     fdb_info->vid);
+				     fdb_info->vid, fdb_info->offloaded);
 		break;
 	}
 
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 74331690a390..e56ba3912a90 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1152,7 +1152,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 }
 
 void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
-			  const unsigned char *addr, u16 vid)
+			  const unsigned char *addr, u16 vid, bool offloaded)
 {
 	struct net_bridge_fdb_entry *fdb;
 
@@ -1160,7 +1160,7 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
 
 	fdb = br_fdb_find(br, addr, vid);
 	if (fdb)
-		fdb->offloaded = 1;
+		fdb->offloaded = offloaded;
 
 	spin_unlock_bh(&br->hash_lock);
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 10ee39fdca5c..2920e06a5403 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -574,7 +574,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid,
 			      bool swdev_notify);
 void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
-			  const unsigned char *addr, u16 vid);
+			  const unsigned char *addr, u16 vid, bool offloaded);
 
 /* br_forward.c */
 enum br_pkt_type {
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index d77f807420c4..b993df770675 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -103,7 +103,7 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
 static void
 br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
 				u16 vid, struct net_device *dev,
-				bool added_by_user)
+				bool added_by_user, bool offloaded)
 {
 	struct switchdev_notifier_fdb_info info;
 	unsigned long notifier_type;
@@ -111,6 +111,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
 	info.addr = mac;
 	info.vid = vid;
 	info.added_by_user = added_by_user;
+	info.offloaded = offloaded;
 	notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
 	call_switchdev_notifiers(notifier_type, dev, &info.info);
 }
@@ -126,13 +127,15 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 		br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
 						fdb->key.vlan_id,
 						fdb->dst->dev,
-						fdb->added_by_user);
+						fdb->added_by_user,
+						fdb->offloaded);
 		break;
 	case RTM_NEWNEIGH:
 		br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
 						fdb->key.vlan_id,
 						fdb->dst->dev,
-						fdb->added_by_user);
+						fdb->added_by_user,
+						fdb->offloaded);
 		break;
 	}
 }
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3f840b6eea69..5428ef529019 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1478,6 +1478,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
 			netdev_dbg(dev, "fdb add failed err=%d\n", err);
 			break;
 		}
+		fdb_info->offloaded = true;
 		call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
 					 &fdb_info->info);
 		break;
-- 
cgit v1.2.3


From b55cbc8d9b44aaee94f19e995a5f241d453763ee Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 17 Oct 2018 16:24:48 +0200
Subject: bpf: fix doc of bpf_skb_adjust_room() in uapi

len_diff is signed.

Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)")
CC: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff6..5e46f6732781 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1433,7 +1433,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
  * 		*skb* by *len_diff*, and according to the selected *mode*.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f9187b41dff6..5e46f6732781 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1433,7 +1433,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
  * 		*skb* by *len_diff*, and according to the selected *mode*.
-- 
cgit v1.2.3


From 82385b0d2d2504aee51aa3fb40ebfb03603f64c3 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Wed, 17 Oct 2018 15:01:37 +0200
Subject: net: skbuff.h: Mark expected switch fall-throughs

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 119d092c6b13..0ba687454267 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3505,13 +3505,19 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 #define __it(x, op) (x -= sizeof(u##op))
 #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
 	case 32: diffs |= __it_diff(a, b, 64);
+		 /* fall through */
 	case 24: diffs |= __it_diff(a, b, 64);
+		 /* fall through */
 	case 16: diffs |= __it_diff(a, b, 64);
+		 /* fall through */
 	case  8: diffs |= __it_diff(a, b, 64);
 		break;
 	case 28: diffs |= __it_diff(a, b, 64);
+		 /* fall through */
 	case 20: diffs |= __it_diff(a, b, 64);
+		 /* fall through */
 	case 12: diffs |= __it_diff(a, b, 64);
+		 /* fall through */
 	case  4: diffs |= __it_diff(a, b, 32);
 		break;
 	}
-- 
cgit v1.2.3


From 605c0ac182c34867bda71bfbcc74958aabbe2fe0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 17 Oct 2018 03:07:50 +0800
Subject: sctp: count both sk and asoc sndbuf with skb truesize and sctp_chunk
 size

Now it's confusing that asoc sndbuf_used is doing memory accounting with
SCTP_DATA_SNDSIZE(chunk) + sizeof(sk_buff) + sizeof(sctp_chunk) while sk
sk_wmem_alloc is doing that with skb->truesize + sizeof(sctp_chunk).

It also causes sctp_prsctp_prune to count with a wrong freed memory when
sndbuf_policy is not set.

To make this right and also keep consistent between asoc sndbuf_used, sk
sk_wmem_alloc and sk_wmem_queued, use skb->truesize + sizeof(sctp_chunk)
for them.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h |  5 -----
 net/sctp/outqueue.c          |  8 ++------
 net/sctp/socket.c            | 21 ++++++---------------
 3 files changed, 8 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 86f034b524d4..8dadc74c22e7 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -148,11 +148,6 @@ SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE,	enum sctp_event_primitive, primitive)
 #define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA || \
 			       a->chunk_hdr->type == SCTP_CID_I_DATA)
 
-/* Calculate the actual data size in a data chunk */
-#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end) - \
-				    (unsigned long)(c->chunk_hdr) - \
-				    sctp_datachk_len(&c->asoc->stream)))
-
 /* Internal error codes */
 enum sctp_ierror {
 	SCTP_IERROR_NO_ERROR	        = 0,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 42191ed9902b..9cb854b05342 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -385,9 +385,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 			asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
 		}
 
-		msg_len -= SCTP_DATA_SNDSIZE(chk) +
-			   sizeof(struct sk_buff) +
-			   sizeof(struct sctp_chunk);
+		msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
 		if (msg_len <= 0)
 			break;
 	}
@@ -421,9 +419,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 			streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
 		}
 
-		msg_len -= SCTP_DATA_SNDSIZE(chk) +
-			   sizeof(struct sk_buff) +
-			   sizeof(struct sctp_chunk);
+		msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
 		sctp_chunk_free(chk);
 		if (msg_len <= 0)
 			break;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d38d5ba..c6f29505c34d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -166,12 +166,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
 	/* Save the chunk pointer in skb for sctp_wfree to use later.  */
 	skb_shinfo(chunk->skb)->destructor_arg = chunk;
 
-	asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
-				sizeof(struct sk_buff) +
-				sizeof(struct sctp_chunk);
-
 	refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
-	sk->sk_wmem_queued += chunk->skb->truesize;
+	asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
+	sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk);
 	sk_mem_charge(sk, chunk->skb->truesize);
 }
 
@@ -8460,17 +8457,11 @@ static void sctp_wfree(struct sk_buff *skb)
 	struct sctp_association *asoc = chunk->asoc;
 	struct sock *sk = asoc->base.sk;
 
-	asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
-				sizeof(struct sk_buff) +
-				sizeof(struct sctp_chunk);
-
-	WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc));
-
-	/*
-	 * This undoes what is done via sctp_set_owner_w and sk_mem_charge
-	 */
-	sk->sk_wmem_queued   -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
+	sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk);
+	asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk);
+	WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk),
+				      &sk->sk_wmem_alloc));
 
 	if (chunk->shkey) {
 		struct sctp_shared_key *shkey = chunk->shkey;
-- 
cgit v1.2.3


From 4972e6fa3a04032830bc3d6bb343d08ab3546773 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 12 Sep 2018 15:36:41 +0300
Subject: net/mlx5: Refactor fragmented buffer struct fields and init flow

Take struct mlx5_frag_buf out of mlx5_frag_buf_ctrl, as it is not
needed to manage and control the datapath of the fragmented buffers API.

struct mlx5_frag_buf contains control info to manage the allocation
and de-allocation of the fragmented buffer.
Its fields are not relevant for datapath, so here I take them out of the
struct mlx5_frag_buf_ctrl, except for the fragments array itself.

In addition, modified mlx5_fill_fbc to initialise the frags pointers
as well. This implies that the buffer must be allocated before the
function is called.

A set of type-specific *_get_byte_size() functions are replaced by
a generic one.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c              |  31 +++----
 drivers/infiniband/hw/mlx5/mlx5_ib.h         |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/wq.c | 120 +++++++++++----------------
 include/linux/mlx5/driver.h                  |  22 ++---
 4 files changed, 69 insertions(+), 105 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 088205d7f1a1..cca1820802b8 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -393,7 +393,7 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
 
 static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
 {
-	mlx5_frag_buf_free(dev->mdev, &buf->fbc.frag_buf);
+	mlx5_frag_buf_free(dev->mdev, &buf->frag_buf);
 }
 
 static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
@@ -728,16 +728,11 @@ static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev,
 			     int nent,
 			     int cqe_size)
 {
-	struct mlx5_frag_buf_ctrl *c = &buf->fbc;
-	struct mlx5_frag_buf *frag_buf = &c->frag_buf;
-	u32 cqc_buff[MLX5_ST_SZ_DW(cqc)] = {0};
+	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
+	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
+	u8 log_wq_sz     = ilog2(cqe_size);
 	int err;
 
-	MLX5_SET(cqc, cqc_buff, log_cq_size, ilog2(cqe_size));
-	MLX5_SET(cqc, cqc_buff, cqe_sz, (cqe_size == 128) ? 1 : 0);
-
-	mlx5_core_init_cq_frag_buf(&buf->fbc, cqc_buff);
-
 	err = mlx5_frag_buf_alloc_node(dev->mdev,
 				       nent * cqe_size,
 				       frag_buf,
@@ -745,6 +740,8 @@ static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev,
 	if (err)
 		return err;
 
+	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
+
 	buf->cqe_size = cqe_size;
 	buf->nent = nent;
 
@@ -934,7 +931,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 
 	*inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 		 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
-		 cq->buf.fbc.frag_buf.npages;
+		 cq->buf.frag_buf.npages;
 	*cqb = kvzalloc(*inlen, GFP_KERNEL);
 	if (!*cqb) {
 		err = -ENOMEM;
@@ -942,11 +939,11 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 	}
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
-	mlx5_fill_page_frag_array(&cq->buf.fbc.frag_buf, pas);
+	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
 	MLX5_SET(cqc, cqc, log_page_size,
-		 cq->buf.fbc.frag_buf.page_shift -
+		 cq->buf.frag_buf.page_shift -
 		 MLX5_ADAPTER_PAGE_SHIFT);
 
 	*index = dev->mdev->priv.uar->index;
@@ -1365,11 +1362,10 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 		cqe_size = 64;
 		err = resize_kernel(dev, cq, entries, cqe_size);
 		if (!err) {
-			struct mlx5_frag_buf_ctrl *c;
+			struct mlx5_frag_buf *frag_buf = &cq->resize_buf->frag_buf;
 
-			c = &cq->resize_buf->fbc;
-			npas = c->frag_buf.npages;
-			page_shift = c->frag_buf.page_shift;
+			npas = frag_buf->npages;
+			page_shift = frag_buf->page_shift;
 		}
 	}
 
@@ -1390,8 +1386,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 		mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
 				     pas, 0);
 	else
-		mlx5_fill_page_frag_array(&cq->resize_buf->fbc.frag_buf,
-					  pas);
+		mlx5_fill_page_frag_array(&cq->resize_buf->frag_buf, pas);
 
 	MLX5_SET(modify_cq_in, in,
 		 modify_field_select_resize_field_select.resize_field_select.resize_field_select,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 320d4dfe8c2f..289c18db2611 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -435,6 +435,7 @@ struct mlx5_ib_qp {
 
 struct mlx5_ib_cq_buf {
 	struct mlx5_frag_buf_ctrl fbc;
+	struct mlx5_frag_buf    frag_buf;
 	struct ib_umem		*umem;
 	int			cqe_size;
 	int			nent;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 68e7f8df2a6d..9007e91ad53f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -54,54 +54,37 @@ u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq)
 	return (u32)wq->fbc.sz_m1 + 1;
 }
 
-static u32 mlx5_wq_cyc_get_byte_size(struct mlx5_wq_cyc *wq)
+static u32 wq_get_byte_sz(u8 log_sz, u8 log_stride)
 {
-	return mlx5_wq_cyc_get_size(wq) << wq->fbc.log_stride;
-}
-
-static u32 mlx5_wq_qp_get_byte_size(struct mlx5_wq_qp *wq)
-{
-	return mlx5_wq_cyc_get_byte_size(&wq->rq) +
-	       mlx5_wq_cyc_get_byte_size(&wq->sq);
-}
-
-static u32 mlx5_cqwq_get_byte_size(struct mlx5_cqwq *wq)
-{
-	return mlx5_cqwq_get_size(wq) << wq->fbc.log_stride;
-}
-
-static u32 mlx5_wq_ll_get_byte_size(struct mlx5_wq_ll *wq)
-{
-	return mlx5_wq_ll_get_size(wq) << wq->fbc.log_stride;
+	return ((u32)1 << log_sz) << log_stride;
 }
 
 int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		       void *wqc, struct mlx5_wq_cyc *wq,
 		       struct mlx5_wq_ctrl *wq_ctrl)
 {
+	u8 log_wq_stride = MLX5_GET(wq, wqc, log_wq_stride);
+	u8 log_wq_sz     = MLX5_GET(wq, wqc, log_wq_sz);
 	struct mlx5_frag_buf_ctrl *fbc = &wq->fbc;
 	int err;
 
-	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
-		      MLX5_GET(wq, wqc, log_wq_sz),
-		      fbc);
-	wq->sz    = wq->fbc.sz_m1 + 1;
-
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
+	wq->db  = wq_ctrl->db.db;
+
+	err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_wq_sz, log_wq_stride),
 				       &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
 		goto err_db_free;
 	}
 
-	fbc->frag_buf = wq_ctrl->buf;
-	wq->db  = wq_ctrl->db.db;
+	mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, fbc);
+	wq->sz = mlx5_wq_cyc_get_size(wq);
 
 	wq_ctrl->mdev = mdev;
 
@@ -113,46 +96,19 @@ err_db_free:
 	return err;
 }
 
-static void mlx5_qp_set_frag_buf(struct mlx5_frag_buf *buf,
-				 struct mlx5_wq_qp *qp)
-{
-	struct mlx5_frag_buf_ctrl *sq_fbc;
-	struct mlx5_frag_buf *rqb, *sqb;
-
-	rqb  = &qp->rq.fbc.frag_buf;
-	*rqb = *buf;
-	rqb->size   = mlx5_wq_cyc_get_byte_size(&qp->rq);
-	rqb->npages = DIV_ROUND_UP(rqb->size, PAGE_SIZE);
-
-	sq_fbc = &qp->sq.fbc;
-	sqb    = &sq_fbc->frag_buf;
-	*sqb   = *buf;
-	sqb->size   = mlx5_wq_cyc_get_byte_size(&qp->sq);
-	sqb->npages = DIV_ROUND_UP(sqb->size, PAGE_SIZE);
-	sqb->frags += rqb->npages; /* first part is for the rq */
-	if (sq_fbc->strides_offset)
-		sqb->frags--;
-}
-
 int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *qpc, struct mlx5_wq_qp *wq,
 		      struct mlx5_wq_ctrl *wq_ctrl)
 {
-	u16 sq_strides_offset;
-	u32 rq_pg_remainder;
-	int err;
+	u8 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride) + 4;
+	u8 log_rq_sz     = MLX5_GET(qpc, qpc, log_rq_size);
+	u8 log_sq_stride = ilog2(MLX5_SEND_WQE_BB);
+	u8 log_sq_sz     = MLX5_GET(qpc, qpc, log_sq_size);
 
-	mlx5_fill_fbc(MLX5_GET(qpc, qpc, log_rq_stride) + 4,
-		      MLX5_GET(qpc, qpc, log_rq_size),
-		      &wq->rq.fbc);
+	u32 rq_byte_size;
+	int err;
 
-	rq_pg_remainder   = mlx5_wq_cyc_get_byte_size(&wq->rq) % PAGE_SIZE;
-	sq_strides_offset = rq_pg_remainder / MLX5_SEND_WQE_BB;
 
-	mlx5_fill_fbc_offset(ilog2(MLX5_SEND_WQE_BB),
-			     MLX5_GET(qpc, qpc, log_sq_size),
-			     sq_strides_offset,
-			     &wq->sq.fbc);
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
@@ -160,14 +116,32 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		return err;
 	}
 
-	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_qp_get_byte_size(wq),
+	err = mlx5_frag_buf_alloc_node(mdev,
+				       wq_get_byte_sz(log_rq_sz, log_rq_stride) +
+				       wq_get_byte_sz(log_sq_sz, log_sq_stride),
 				       &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
 		goto err_db_free;
 	}
 
-	mlx5_qp_set_frag_buf(&wq_ctrl->buf, wq);
+	mlx5_init_fbc(wq_ctrl->buf.frags, log_rq_stride, log_rq_sz, &wq->rq.fbc);
+
+	rq_byte_size = wq_get_byte_sz(log_rq_sz, log_rq_stride);
+
+	if (rq_byte_size < PAGE_SIZE) {
+		/* SQ starts within the same page of the RQ */
+		u16 sq_strides_offset = rq_byte_size / MLX5_SEND_WQE_BB;
+
+		mlx5_init_fbc_offset(wq_ctrl->buf.frags,
+				     log_sq_stride, log_sq_sz, sq_strides_offset,
+				     &wq->sq.fbc);
+	} else {
+		u16 rq_npages = rq_byte_size >> PAGE_SHIFT;
+
+		mlx5_init_fbc(wq_ctrl->buf.frags + rq_npages,
+			      log_sq_stride, log_sq_sz, &wq->sq.fbc);
+	}
 
 	wq->rq.db  = &wq_ctrl->db.db[MLX5_RCV_DBR];
 	wq->sq.db  = &wq_ctrl->db.db[MLX5_SND_DBR];
@@ -186,17 +160,19 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		     void *cqc, struct mlx5_cqwq *wq,
 		     struct mlx5_wq_ctrl *wq_ctrl)
 {
+	u8 log_wq_stride = MLX5_GET(cqc, cqc, cqe_sz) + 6;
+	u8 log_wq_sz     = MLX5_GET(cqc, cqc, log_cq_size);
 	int err;
 
-	mlx5_core_init_cq_frag_buf(&wq->fbc, cqc);
-
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_frag_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
+	wq->db  = wq_ctrl->db.db;
+
+	err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_wq_sz, log_wq_stride),
 				       &wq_ctrl->buf,
 				       param->buf_numa_node);
 	if (err) {
@@ -205,8 +181,7 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		goto err_db_free;
 	}
 
-	wq->fbc.frag_buf = wq_ctrl->buf;
-	wq->db  = wq_ctrl->db.db;
+	mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, &wq->fbc);
 
 	wq_ctrl->mdev = mdev;
 
@@ -222,30 +197,29 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *wqc, struct mlx5_wq_ll *wq,
 		      struct mlx5_wq_ctrl *wq_ctrl)
 {
+	u8 log_wq_stride = MLX5_GET(wq, wqc, log_wq_stride);
+	u8 log_wq_sz     = MLX5_GET(wq, wqc, log_wq_sz);
 	struct mlx5_frag_buf_ctrl *fbc = &wq->fbc;
 	struct mlx5_wqe_srq_next_seg *next_seg;
 	int err;
 	int i;
 
-	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
-		      MLX5_GET(wq, wqc, log_wq_sz),
-		      fbc);
-
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_ll_get_byte_size(wq),
+	wq->db  = wq_ctrl->db.db;
+
+	err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_wq_sz, log_wq_stride),
 				       &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
 		goto err_db_free;
 	}
 
-	wq->fbc.frag_buf = wq_ctrl->buf;
-	wq->db  = wq_ctrl->db.db;
+	mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, fbc);
 
 	for (i = 0; i < fbc->sz_m1; i++) {
 		next_seg = mlx5_wq_ll_get_wqe(wq, i);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 94ffd02af7cd..e10f61a1f77d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -357,7 +357,7 @@ struct mlx5_frag_buf {
 };
 
 struct mlx5_frag_buf_ctrl {
-	struct mlx5_frag_buf	frag_buf;
+	struct mlx5_buf_list   *frags;
 	u32			sz_m1;
 	u16			frag_sz_m1;
 	u16			strides_offset;
@@ -994,10 +994,12 @@ static inline u32 mlx5_base_mkey(const u32 key)
 	return key & 0xffffff00u;
 }
 
-static inline void mlx5_fill_fbc_offset(u8 log_stride, u8 log_sz,
+static inline void mlx5_init_fbc_offset(struct mlx5_buf_list *frags,
+					u8 log_stride, u8 log_sz,
 					u16 strides_offset,
 					struct mlx5_frag_buf_ctrl *fbc)
 {
+	fbc->frags      = frags;
 	fbc->log_stride = log_stride;
 	fbc->log_sz     = log_sz;
 	fbc->sz_m1	= (1 << fbc->log_sz) - 1;
@@ -1006,18 +1008,11 @@ static inline void mlx5_fill_fbc_offset(u8 log_stride, u8 log_sz,
 	fbc->strides_offset = strides_offset;
 }
 
-static inline void mlx5_fill_fbc(u8 log_stride, u8 log_sz,
+static inline void mlx5_init_fbc(struct mlx5_buf_list *frags,
+				 u8 log_stride, u8 log_sz,
 				 struct mlx5_frag_buf_ctrl *fbc)
 {
-	mlx5_fill_fbc_offset(log_stride, log_sz, 0, fbc);
-}
-
-static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc,
-					      void *cqc)
-{
-	mlx5_fill_fbc(6 + MLX5_GET(cqc, cqc, cqe_sz),
-		      MLX5_GET(cqc, cqc, log_cq_size),
-		      fbc);
+	mlx5_init_fbc_offset(frags, log_stride, log_sz, 0, fbc);
 }
 
 static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc,
@@ -1028,8 +1023,7 @@ static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc,
 	ix  += fbc->strides_offset;
 	frag = ix >> fbc->log_frag_strides;
 
-	return fbc->frag_buf.frags[frag].buf +
-		((fbc->frag_sz_m1 & ix) << fbc->log_stride);
+	return fbc->frags[frag].buf + ((fbc->frag_sz_m1 & ix) << fbc->log_stride);
 }
 
 int mlx5_cmd_init(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From 4b5b9c7d972e8a7b1e7691c7c921ec0d6dec33b9 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayag@mellanox.com>
Date: Tue, 9 Oct 2018 14:16:43 +0300
Subject: net/mlx5: Add FEC fields to Port Phy Link Mode (PPLM) reg

Added FEC related fields to PPLM layout.
These fields are needed to set and query FEC policy
for different link speeds.

Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e10f61a1f77d..696ed3f7f894 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -133,6 +133,7 @@ enum {
 	MLX5_REG_PVLC		 = 0x500f,
 	MLX5_REG_PCMR		 = 0x5041,
 	MLX5_REG_PMLP		 = 0x5002,
+	MLX5_REG_PPLM		 = 0x5023,
 	MLX5_REG_PCAM		 = 0x507f,
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 963611820006..47b09a742ae5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -7828,20 +7828,34 @@ struct mlx5_ifc_pplr_reg_bits {
 
 struct mlx5_ifc_pplm_reg_bits {
 	u8         reserved_at_0[0x8];
-	u8         local_port[0x8];
-	u8         reserved_at_10[0x10];
+	u8	   local_port[0x8];
+	u8	   reserved_at_10[0x10];
 
-	u8         reserved_at_20[0x20];
+	u8	   reserved_at_20[0x20];
 
-	u8         port_profile_mode[0x8];
-	u8         static_port_profile[0x8];
-	u8         active_port_profile[0x8];
-	u8         reserved_at_58[0x8];
+	u8	   port_profile_mode[0x8];
+	u8	   static_port_profile[0x8];
+	u8	   active_port_profile[0x8];
+	u8	   reserved_at_58[0x8];
 
-	u8         retransmission_active[0x8];
-	u8         fec_mode_active[0x18];
+	u8	   retransmission_active[0x8];
+	u8	   fec_mode_active[0x18];
 
-	u8         reserved_at_80[0x20];
+	u8	   rs_fec_correction_bypass_cap[0x4];
+	u8	   reserved_at_84[0x8];
+	u8	   fec_override_cap_56g[0x4];
+	u8	   fec_override_cap_100g[0x4];
+	u8	   fec_override_cap_50g[0x4];
+	u8	   fec_override_cap_25g[0x4];
+	u8	   fec_override_cap_10g_40g[0x4];
+
+	u8	   rs_fec_correction_bypass_admin[0x4];
+	u8	   reserved_at_a4[0x8];
+	u8	   fec_override_admin_56g[0x4];
+	u8	   fec_override_admin_100g[0x4];
+	u8	   fec_override_admin_50g[0x4];
+	u8	   fec_override_admin_25g[0x4];
+	u8	   fec_override_admin_10g_40g[0x4];
 };
 
 struct mlx5_ifc_ppcnt_reg_bits {
@@ -8137,7 +8151,10 @@ struct mlx5_ifc_pcam_enhanced_features_bits {
 struct mlx5_ifc_pcam_regs_5000_to_507f_bits {
 	u8         port_access_reg_cap_mask_127_to_96[0x20];
 	u8         port_access_reg_cap_mask_95_to_64[0x20];
-	u8         port_access_reg_cap_mask_63_to_32[0x20];
+
+	u8         port_access_reg_cap_mask_63_to_36[0x1c];
+	u8         pplm[0x1];
+	u8         port_access_reg_cap_mask_34_to_32[0x3];
 
 	u8         port_access_reg_cap_mask_31_to_13[0x13];
 	u8         pbmc[0x1];
-- 
cgit v1.2.3


From 67daf1186086ad4b2ec09b8078b835936977d06a Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayag@mellanox.com>
Date: Sun, 30 Sep 2018 09:58:08 +0300
Subject: net/mlx5: Added "per_lane_error_counters" cap bit to PCAM

Added "Per lane raw errors" capability bit in
Ports Capabilities Mask (PCAM) enhanced features
layout.

This bit determines if the fields "phy_raw_errors_laneX"
in "Physical Layer statistical" counters group are supported.

Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 47b09a742ae5..dbff9ff28f2c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8140,7 +8140,8 @@ struct mlx5_ifc_pcam_enhanced_features_bits {
 	u8         rx_icrc_encapsulated_counter[0x1];
 	u8	   reserved_at_6e[0x8];
 	u8         pfcc_mask[0x1];
-	u8         reserved_at_77[0x4];
+	u8         reserved_at_77[0x3];
+	u8         per_lane_error_counters[0x1];
 	u8         rx_buffer_fullness_counters[0x1];
 	u8         ptys_connector_type[0x1];
 	u8         reserved_at_7d[0x1];
-- 
cgit v1.2.3


From 5f1be84aad4b520a36246d0c289ad73641277630 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 12 Oct 2018 03:01:54 +0900
Subject: netfilter: nf_flow_table: remove unnecessary parameter of
 nf_flow_table_cleanup()

parameter net of nf_flow_table_cleanup() is not used.
So that it can be removed.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 2 +-
 net/netfilter/nf_flow_table_core.c    | 2 +-
 net/netfilter/nft_flow_offload.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 0e355f4a3d76..77e2761d4f2f 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -99,7 +99,7 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 			  void (*iter)(struct flow_offload *flow, void *data),
 			  void *data);
 
-void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
+void nf_flow_table_cleanup(struct net_device *dev);
 
 int nf_flow_table_init(struct nf_flowtable *flow_table);
 void nf_flow_table_free(struct nf_flowtable *flow_table);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 185c633b6872..a3cc2ef8a48a 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -483,7 +483,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
 	flush_delayed_work(&flowtable->gc_work);
 }
 
-void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+void nf_flow_table_cleanup(struct net_device *dev)
 {
 	struct nf_flowtable *flowtable;
 
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index d6bab8c3cbb0..e82d9a966c45 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -201,7 +201,7 @@ static int flow_offload_netdev_event(struct notifier_block *this,
 	if (event != NETDEV_DOWN)
 		return NOTIFY_DONE;
 
-	nf_flow_table_cleanup(dev_net(dev), dev);
+	nf_flow_table_cleanup(dev);
 
 	return NOTIFY_DONE;
 }
-- 
cgit v1.2.3


From 468c041cff57e87f18e1022cacf9f5c98bf00b58 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 18 Oct 2018 22:29:59 +0900
Subject: netfilter: nfnetlink_log: remove empty nfnetlink_log.h header file

/include/net/netfilter/nfnetlink_log.h file is empty.
so that it can be removed.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nfnetlink_log.h | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 include/net/netfilter/nfnetlink_log.h

(limited to 'include')

diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h
deleted file mode 100644
index ea32a7d3cf1b..000000000000
--- a/include/net/netfilter/nfnetlink_log.h
+++ /dev/null
@@ -1 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-- 
cgit v1.2.3


From af510ebd8913bee016492832f532ed919b51c09c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 19 Oct 2018 11:48:24 +0200
Subject: Revert "netfilter: xt_quota: fix the behavior of xt_quota module"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit e9837e55b0200da544a095a1fca36efd7fd3ba30.

When talking to Maze and Chenbo, we agreed to keep this back by now
due to problems in the ruleset listing path with 32-bit arches.

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_quota.h |  8 ++---
 net/netfilter/xt_quota.c                | 55 ++++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_quota.h b/include/uapi/linux/netfilter/xt_quota.h
index d72fd52adbba..f3ba5d9e58b6 100644
--- a/include/uapi/linux/netfilter/xt_quota.h
+++ b/include/uapi/linux/netfilter/xt_quota.h
@@ -15,11 +15,9 @@ struct xt_quota_info {
 	__u32 flags;
 	__u32 pad;
 	__aligned_u64 quota;
-#ifdef __KERNEL__
-	atomic64_t counter;
-#else
-	__aligned_u64 remain;
-#endif
+
+	/* Used internally by the kernel */
+	struct xt_quota_priv	*master;
 };
 
 #endif /* _XT_QUOTA_H */
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index fceae245eb03..10d61a6eed71 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -11,6 +11,11 @@
 #include <linux/netfilter/xt_quota.h>
 #include <linux/module.h>
 
+struct xt_quota_priv {
+	spinlock_t	lock;
+	uint64_t	quota;
+};
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
 MODULE_DESCRIPTION("Xtables: countdown quota match");
@@ -21,48 +26,54 @@ static bool
 quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	struct xt_quota_info *q = (void *)par->matchinfo;
-	u64 current_count = atomic64_read(&q->counter);
+	struct xt_quota_priv *priv = q->master;
 	bool ret = q->flags & XT_QUOTA_INVERT;
-	u64 old_count, new_count;
-
-	do {
-		if (current_count == 1)
-			return ret;
-		if (current_count <= skb->len) {
-			atomic64_set(&q->counter, 1);
-			return ret;
-		}
-		old_count = current_count;
-		new_count = current_count - skb->len;
-		current_count = atomic64_cmpxchg(&q->counter, old_count,
-						 new_count);
-	} while (current_count != old_count);
-	return !ret;
+
+	spin_lock_bh(&priv->lock);
+	if (priv->quota >= skb->len) {
+		priv->quota -= skb->len;
+		ret = !ret;
+	} else {
+		/* we do not allow even small packets from now on */
+		priv->quota = 0;
+	}
+	spin_unlock_bh(&priv->lock);
+
+	return ret;
 }
 
 static int quota_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_quota_info *q = par->matchinfo;
 
-	BUILD_BUG_ON(sizeof(atomic64_t) != sizeof(__u64));
-
 	if (q->flags & ~XT_QUOTA_MASK)
 		return -EINVAL;
-	if (atomic64_read(&q->counter) > q->quota + 1)
-		return -ERANGE;
 
-	if (atomic64_read(&q->counter) == 0)
-		atomic64_set(&q->counter, q->quota + 1);
+	q->master = kmalloc(sizeof(*q->master), GFP_KERNEL);
+	if (q->master == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&q->master->lock);
+	q->master->quota = q->quota;
 	return 0;
 }
 
+static void quota_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_quota_info *q = par->matchinfo;
+
+	kfree(q->master);
+}
+
 static struct xt_match quota_mt_reg __read_mostly = {
 	.name       = "quota",
 	.revision   = 0,
 	.family     = NFPROTO_UNSPEC,
 	.match      = quota_mt,
 	.checkentry = quota_mt_check,
+	.destroy    = quota_mt_destroy,
 	.matchsize  = sizeof(struct xt_quota_info),
+	.usersize   = offsetof(struct xt_quota_info, master),
 	.me         = THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 144991602e6a14d667b295f1b099e609ce857772 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Date: Thu, 18 Oct 2018 15:16:09 +0200
Subject: bpf: rename stack trace map operations

In the following patches queue and stack maps (FIFO and LIFO
datastructures) will be implemented.  In order to avoid confusion and
a possible name clash rename stack_map_ops to stack_trace_map_ops

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_types.h | 2 +-
 kernel/bpf/stackmap.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa48343a5ea1..7bad4e1947ed 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -51,7 +51,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops)
 #ifdef CONFIG_PERF_EVENTS
-BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b2ade10f7ec3..90daf285de03 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -600,7 +600,7 @@ static void stack_map_free(struct bpf_map *map)
 	put_callchain_buffers();
 }
 
-const struct bpf_map_ops stack_map_ops = {
+const struct bpf_map_ops stack_trace_map_ops = {
 	.map_alloc = stack_map_alloc,
 	.map_free = stack_map_free,
 	.map_get_next_key = stack_map_get_next_key,
-- 
cgit v1.2.3


From 2ea864c58f19bf70a0e2415f9f1c53814e07f1b4 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Date: Thu, 18 Oct 2018 15:16:20 +0200
Subject: bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE

ARG_PTR_TO_UNINIT_MAP_VALUE argument is a pointer to a memory zone
used to save the value of a map.  Basically the same as
ARG_PTR_TO_UNINIT_MEM, but the size has not be passed as an extra
argument.

This will be used in the following patch that implements some new
helpers that receive a pointer to be filled with a map value.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/verifier.c | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e60fff48288b..0f8b863e0229 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -138,6 +138,7 @@ enum bpf_arg_type {
 	ARG_CONST_MAP_PTR,	/* const argument used as pointer to bpf_map */
 	ARG_PTR_TO_MAP_KEY,	/* pointer to stack used as map key */
 	ARG_PTR_TO_MAP_VALUE,	/* pointer to stack used as map value */
+	ARG_PTR_TO_UNINIT_MAP_VALUE,	/* pointer to valid memory used to store a map value */
 
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3f93a548a642..d84c91ac3b70 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2117,7 +2117,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
-	    arg_type == ARG_PTR_TO_MAP_VALUE) {
+	    arg_type == ARG_PTR_TO_MAP_VALUE ||
+	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
 		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
@@ -2187,7 +2188,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->key_size, false,
 					      NULL);
-	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+	} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+		   arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
 		 */
@@ -2196,9 +2198,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
+		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->value_size, false,
-					      NULL);
+					      meta);
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-- 
cgit v1.2.3


From f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Date: Thu, 18 Oct 2018 15:16:25 +0200
Subject: bpf: add queue and stack maps

Queue/stack maps implement a FIFO/LIFO data storage for ebpf programs.
These maps support peek, pop and push operations that are exposed to eBPF
programs through the new bpf_map[peek/pop/push] helpers.  Those operations
are exposed to userspace applications through the already existing
syscalls in the following way:

BPF_MAP_LOOKUP_ELEM            -> peek
BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop
BPF_MAP_UPDATE_ELEM            -> push

Queue/stack maps are implemented using a buffer, tail and head indexes,
hence BPF_F_NO_PREALLOC is not supported.

As opposite to other maps, queue and stack do not use RCU for protecting
maps values, the bpf_map[peek/pop] have a ARG_PTR_TO_UNINIT_MAP_VALUE
argument that is a pointer to a memory zone where to save the value of a
map.  Basically the same as ARG_PTR_TO_UNINIT_MEM, but the size has not
be passed as an extra argument.

Our main motivation for implementing queue/stack maps was to keep track
of a pool of elements, like network ports in a SNAT, however we forsee
other use cases, like for exampling saving last N kernel events in a map
and then analysing from userspace.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |   6 +
 include/linux/bpf_types.h     |   2 +
 include/uapi/linux/bpf.h      |  29 ++++-
 kernel/bpf/Makefile           |   2 +-
 kernel/bpf/core.c             |   3 +
 kernel/bpf/helpers.c          |  43 +++++++
 kernel/bpf/queue_stack_maps.c | 288 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c          |   6 +
 kernel/bpf/verifier.c         |  19 ++-
 net/core/filter.c             |   6 +
 10 files changed, 401 insertions(+), 3 deletions(-)
 create mode 100644 kernel/bpf/queue_stack_maps.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0f8b863e0229..33014ae73103 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,9 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	int (*map_pop_elem)(struct bpf_map *map, void *value);
+	int (*map_peek_elem)(struct bpf_map *map, void *value);
 
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
@@ -811,6 +814,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
+extern const struct bpf_func_proto bpf_map_push_elem_proto;
+extern const struct bpf_func_proto bpf_map_pop_elem_proto;
+extern const struct bpf_func_proto bpf_map_peek_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 7bad4e1947ed..44d9ab4809bd 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -69,3 +69,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
 #endif
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5e46f6732781..70082cb626b4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -128,6 +128,8 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_QUEUE,
+	BPF_MAP_TYPE_STACK,
 };
 
 enum bpf_prog_type {
@@ -462,6 +464,28 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* in *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is removed to
+ * 		make room for this.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -2303,7 +2327,10 @@ union bpf_attr {
 	FN(skb_ancestor_cgroup_id),	\
 	FN(sk_lookup_tcp),		\
 	FN(sk_lookup_udp),		\
-	FN(sk_release),
+	FN(sk_release),			\
+	FN(map_push_elem),		\
+	FN(map_pop_elem),		\
+	FN(map_peek_elem),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index ff8262626b8f..4c2fa3ac56f6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,7 +3,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index defcf4df6d91..7c7eeea8cffc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32)
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
 const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+const struct bpf_func_proto bpf_map_push_elem_proto __weak;
+const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
+const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6502115e8f55..ab0d5e3f9892 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
+BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
+{
+	return map->ops->map_push_elem(map, value, flags);
+}
+
+const struct bpf_func_proto bpf_map_push_elem_proto = {
+	.func		= bpf_map_push_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
+{
+	return map->ops->map_pop_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_pop_elem_proto = {
+	.func		= bpf_map_pop_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
+};
+
+BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
+{
+	return map->ops->map_peek_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_peek_elem_proto = {
+	.func		= bpf_map_pop_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
+};
+
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 	.func		= bpf_user_rnd_u32,
 	.gpl_only	= false,
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
new file mode 100644
index 000000000000..12a93fb37449
--- /dev/null
+++ b/kernel/bpf/queue_stack_maps.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * queue_stack_maps.c: BPF queue and stack maps
+ *
+ * Copyright (c) 2018 Politecnico di Torino
+ */
+#include <linux/bpf.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "percpu_freelist.h"
+
+#define QUEUE_STACK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+
+struct bpf_queue_stack {
+	struct bpf_map map;
+	raw_spinlock_t lock;
+	u32 head, tail;
+	u32 size; /* max_entries + 1 */
+
+	char elements[0] __aligned(8);
+};
+
+static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_queue_stack, map);
+}
+
+static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs)
+{
+	return qs->head == qs->tail;
+}
+
+static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
+{
+	u32 head = qs->head + 1;
+
+	if (unlikely(head >= qs->size))
+		head = 0;
+
+	return head == qs->tail;
+}
+
+/* Called from syscall */
+static int queue_stack_map_alloc_check(union bpf_attr *attr)
+{
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 0 ||
+	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
+		return -EINVAL;
+
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		/* if value_size is bigger, the user space won't be able to
+		 * access the elements.
+		 */
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
+{
+	int ret, numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_queue_stack *qs;
+	u32 size, value_size;
+	u64 queue_size, cost;
+
+	size = attr->max_entries + 1;
+	value_size = attr->value_size;
+
+	queue_size = sizeof(*qs) + (u64) value_size * size;
+
+	cost = queue_size;
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	ret = bpf_map_precharge_memlock(cost);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	qs = bpf_map_area_alloc(queue_size, numa_node);
+	if (!qs)
+		return ERR_PTR(-ENOMEM);
+
+	memset(qs, 0, sizeof(*qs));
+
+	bpf_map_init_from_attr(&qs->map, attr);
+
+	qs->map.pages = cost;
+	qs->size = size;
+
+	raw_spin_lock_init(&qs->lock);
+
+	return &qs->map;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_stack_map_free(struct bpf_map *map)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	bpf_map_area_free(qs);
+}
+
+static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long flags;
+	int err = 0;
+	void *ptr;
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+
+	if (queue_stack_map_is_empty(qs)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	ptr = &qs->elements[qs->tail * qs->map.value_size];
+	memcpy(value, ptr, qs->map.value_size);
+
+	if (delete) {
+		if (unlikely(++qs->tail >= qs->size))
+			qs->tail = 0;
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	return err;
+}
+
+
+static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long flags;
+	int err = 0;
+	void *ptr;
+	u32 index;
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+
+	if (queue_stack_map_is_empty(qs)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	index = qs->head - 1;
+	if (unlikely(index >= qs->size))
+		index = qs->size - 1;
+
+	ptr = &qs->elements[index * qs->map.value_size];
+	memcpy(value, ptr, qs->map.value_size);
+
+	if (delete)
+		qs->head = index;
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	return err;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return __queue_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return __stack_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return __queue_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return __stack_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
+				     u64 flags)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long irq_flags;
+	int err = 0;
+	void *dst;
+
+	/* BPF_EXIST is used to force making room for a new element in case the
+	 * map is full
+	 */
+	bool replace = (flags & BPF_EXIST);
+
+	/* Check supported flags for queue and stack maps */
+	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&qs->lock, irq_flags);
+
+	if (queue_stack_map_is_full(qs)) {
+		if (!replace) {
+			err = -E2BIG;
+			goto out;
+		}
+		/* advance tail pointer to overwrite oldest element */
+		if (unlikely(++qs->tail >= qs->size))
+			qs->tail = 0;
+	}
+
+	dst = &qs->elements[qs->head * qs->map.value_size];
+	memcpy(dst, value, qs->map.value_size);
+
+	if (unlikely(++qs->head >= qs->size))
+		qs->head = 0;
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+	return err;
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 flags)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
+					void *next_key)
+{
+	return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+	.map_alloc_check = queue_stack_map_alloc_check,
+	.map_alloc = queue_stack_map_alloc,
+	.map_free = queue_stack_map_free,
+	.map_lookup_elem = queue_stack_map_lookup_elem,
+	.map_update_elem = queue_stack_map_update_elem,
+	.map_delete_elem = queue_stack_map_delete_elem,
+	.map_push_elem = queue_stack_map_push_elem,
+	.map_pop_elem = queue_map_pop_elem,
+	.map_peek_elem = queue_map_peek_elem,
+	.map_get_next_key = queue_stack_map_get_next_key,
+};
+
+const struct bpf_map_ops stack_map_ops = {
+	.map_alloc_check = queue_stack_map_alloc_check,
+	.map_alloc = queue_stack_map_alloc,
+	.map_free = queue_stack_map_free,
+	.map_lookup_elem = queue_stack_map_lookup_elem,
+	.map_update_elem = queue_stack_map_update_elem,
+	.map_delete_elem = queue_stack_map_delete_elem,
+	.map_push_elem = queue_stack_map_push_elem,
+	.map_pop_elem = stack_map_pop_elem,
+	.map_peek_elem = stack_map_peek_elem,
+	.map_get_next_key = queue_stack_map_get_next_key,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 78d9dd95e25f..1617407f9ee5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -727,6 +727,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
 		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_peek_elem(map, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
@@ -857,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr)
 		/* rcu_read_lock() is not needed */
 		err = bpf_fd_reuseport_array_update_elem(map, key, value,
 							 attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_push_elem(map, value, attr->flags);
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d84c91ac3b70..7d6d9cf9ebd5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2324,6 +2324,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_sk_select_reuseport)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_QUEUE:
+	case BPF_MAP_TYPE_STACK:
+		if (func_id != BPF_FUNC_map_peek_elem &&
+		    func_id != BPF_FUNC_map_pop_elem &&
+		    func_id != BPF_FUNC_map_push_elem)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2380,6 +2387,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
 			goto error;
 		break;
+	case BPF_FUNC_map_peek_elem:
+	case BPF_FUNC_map_pop_elem:
+	case BPF_FUNC_map_push_elem:
+		if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+		    map->map_type != BPF_MAP_TYPE_STACK)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2675,7 +2689,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	if (func_id != BPF_FUNC_tail_call &&
 	    func_id != BPF_FUNC_map_lookup_elem &&
 	    func_id != BPF_FUNC_map_update_elem &&
-	    func_id != BPF_FUNC_map_delete_elem)
+	    func_id != BPF_FUNC_map_delete_elem &&
+	    func_id != BPF_FUNC_map_push_elem &&
+	    func_id != BPF_FUNC_map_pop_elem &&
+	    func_id != BPF_FUNC_map_peek_elem)
 		return 0;
 
 	if (meta->map_ptr == NULL) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 1a3ac6c46873..ea48ec789b5c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4876,6 +4876,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_map_update_elem_proto;
 	case BPF_FUNC_map_delete_elem:
 		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_map_push_elem:
+		return &bpf_map_push_elem_proto;
+	case BPF_FUNC_map_pop_elem:
+		return &bpf_map_pop_elem_proto;
+	case BPF_FUNC_map_peek_elem:
+		return &bpf_map_peek_elem_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
-- 
cgit v1.2.3


From bd513cd08f10cbe28856f99ae951e86e86803861 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Date: Thu, 18 Oct 2018 15:16:30 +0200
Subject: bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall

The previous patch implemented a bpf queue/stack maps that
provided the peek/pop/push functions.  There is not a direct
relationship between those functions and the current maps
syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added,
this is mapped to the pop operation in the queue/stack maps
and it is still to implement in other kind of maps.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/syscall.c     | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70082cb626b4..a2fb333290dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1617407f9ee5..49ae64a26562 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -999,6 +999,69 @@ err_put:
 	return err;
 }
 
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_user_ptr(attr->key);
+	void __user *uvalue = u64_to_user_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct bpf_map *map;
+	void *key, *value, *ptr;
+	u32 value_size;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	key = __bpf_copy_key(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto err_put;
+	}
+
+	value_size = map->value_size;
+
+	err = -ENOMEM;
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	    map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_pop_elem(map, value);
+	} else {
+		err = -ENOTSUPP;
+	}
+
+	if (err)
+		goto free_value;
+
+	if (copy_to_user(uvalue, value, value_size) != 0)
+		goto free_value;
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
 	[_id] = & _name ## _prog_ops,
@@ -2472,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_TASK_FD_QUERY:
 		err = bpf_task_fd_query(&attr, uattr);
 		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+		err = map_lookup_and_delete_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From b39b5f411dcfce28ff954e5d6acb2c11be3cb0ec Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 19 Oct 2018 09:57:57 -0700
Subject: bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB

BPF programs of BPF_PROG_TYPE_CGROUP_SKB need to access headers in the
skb. This patch enables direct access of skb for these programs.

Two helper functions bpf_compute_and_save_data_end() and
bpf_restore_data_end() are introduced. There are used in
__cgroup_bpf_run_filter_skb(), to compute proper data_end for the
BPF program, and restore original data afterwards.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 21 +++++++++++++++++++++
 kernel/bpf/cgroup.c    |  6 ++++++
 net/core/filter.c      | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5771874bc01e..91b4c934f02e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -548,6 +548,27 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
+/* Similar to bpf_compute_data_pointers(), except that save orginal
+ * data in cb->data and cb->meta_data for restore.
+ */
+static inline void bpf_compute_and_save_data_end(
+	struct sk_buff *skb, void **saved_data_end)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	*saved_data_end = cb->data_end;
+	cb->data_end  = skb->data + skb_headlen(skb);
+}
+
+/* Restore data saved by bpf_compute_data_pointers(). */
+static inline void bpf_restore_data_end(
+	struct sk_buff *skb, void *saved_data_end)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	cb->data_end = saved_data_end;
+}
+
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 {
 	/* eBPF programs may read/write skb->cb[] area to transfer meta
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 00f6ed2e4f9a..9425c2fb872f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 {
 	unsigned int offset = skb->data - skb_network_header(skb);
 	struct sock *save_sk;
+	void *saved_data_end;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	save_sk = skb->sk;
 	skb->sk = sk;
 	__skb_push(skb, offset);
+
+	/* compute pointers for the bpf prog */
+	bpf_compute_and_save_data_end(skb, &saved_data_end);
+
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
 				 bpf_prog_run_save_cb);
+	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;
 	return ret == 1 ? 0 : -EPERM;
diff --git a/net/core/filter.c b/net/core/filter.c
index ea48ec789b5c..5fd5139e8638 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5352,6 +5352,40 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
+static bool cg_skb_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   const struct bpf_prog *prog,
+				   struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		return false;
+	}
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range(struct __sk_buff, mark):
+		case bpf_ctx_range(struct __sk_buff, priority):
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
 				const struct bpf_prog *prog,
@@ -7044,7 +7078,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
 
 const struct bpf_verifier_ops cg_skb_verifier_ops = {
 	.get_func_proto		= cg_skb_func_proto,
-	.is_valid_access	= sk_filter_is_valid_access,
+	.is_valid_access	= cg_skb_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
-- 
cgit v1.2.3


From 5032d079909d1ac5c2535acc32d5f01cd245d8ea Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 18 Oct 2018 13:58:35 -0700
Subject: bpf: skmsg, fix psock create on existing kcm/tls port

Before using the psock returned by sk_psock_get() when adding it to a
sockmap we need to ensure it is actually a sockmap based psock.
Previously we were only checking this after incrementing the reference
counter which was an error. This resulted in a slab-out-of-bounds
error when the psock was not actually a sockmap type.

This moves the check up so the reference counter is only used
if it is a sockmap psock.

Eric reported the following KASAN BUG,

BUG: KASAN: slab-out-of-bounds in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
BUG: KASAN: slab-out-of-bounds in refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
Read of size 4 at addr ffff88019548be58 by task syz-executor4/22387

CPU: 1 PID: 22387 Comm: syz-executor4 Not tainted 4.19.0-rc7+ #264
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113
 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
 refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
 sk_psock_get include/linux/skmsg.h:379 [inline]
 sock_map_link.isra.6+0x41f/0xe30 net/core/sock_map.c:178
 sock_hash_update_common+0x19b/0x11e0 net/core/sock_map.c:669
 sock_hash_update_elem+0x306/0x470 net/core/sock_map.c:738
 map_update_elem+0x819/0xdf0 kernel/bpf/syscall.c:818

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h | 25 ++++++++++++++++++++-----
 net/core/sock_map.c   | 11 ++++++-----
 2 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 22347b08e1f8..84e18863f6a4 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -270,11 +270,6 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
-static inline bool sk_has_psock(struct sock *sk)
-{
-	return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
-}
-
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
@@ -374,6 +369,26 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock,
 	return test_bit(bit, &psock->state);
 }
 
+static inline struct sk_psock *sk_psock_get_checked(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock) {
+		if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) {
+			psock = ERR_PTR(-EBUSY);
+			goto out;
+		}
+
+		if (!refcount_inc_not_zero(&psock->refcnt))
+			psock = ERR_PTR(-EBUSY);
+	}
+out:
+	rcu_read_unlock();
+	return psock;
+}
+
 static inline struct sk_psock *sk_psock_get(struct sock *sk)
 {
 	struct sk_psock *psock;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3c0e44cb811a..be6092ac69f8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -175,12 +175,13 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		}
 	}
 
-	psock = sk_psock_get(sk);
+	psock = sk_psock_get_checked(sk);
+	if (IS_ERR(psock)) {
+		ret = PTR_ERR(psock);
+		goto out_progs;
+	}
+
 	if (psock) {
-		if (!sk_has_psock(sk)) {
-			ret = -EBUSY;
-			goto out_progs;
-		}
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
 			sk_psock_put(sk, psock);
-- 
cgit v1.2.3


From c9fbd71f73094311b31ee703a918e9e0df502cef Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Thu, 18 Oct 2018 11:18:26 -0400
Subject: netpoll: allow cleanup to be synchronous

This fixes a problem introduced by:
commit 2cde6acd49da ("netpoll: Fix __netpoll_rcu_free so that it can hold the rtnl lock")

When using netconsole on a bond, __netpoll_cleanup can asynchronously
recurse multiple times, each __netpoll_free_async call can result in
more __netpoll_free_async's. This means there is now a race between
cleanup_work queues on multiple netpoll_info's on multiple devices and
the configuration of a new netpoll. For example if a netconsole is set
to enable 0, reconfigured, and enable 1 immediately, this netconsole
will likely not work.

Given the reason for __netpoll_free_async is it can be called when rtnl
is not locked, if it is locked, we should be able to execute
synchronously. It appears to be locked everywhere it's called from.

Generalize the design pattern from the teaming driver for current
callers of __netpoll_free_async.

CC: Neil Horman <nhorman@tuxdriver.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  3 ++-
 drivers/net/macvlan.c           |  2 +-
 drivers/net/team/team.c         |  5 +----
 include/linux/netpoll.h         |  4 +---
 net/8021q/vlan_dev.c            |  3 +--
 net/bridge/br_device.c          |  2 +-
 net/core/netpoll.c              | 21 +++++----------------
 net/dsa/slave.c                 |  2 +-
 8 files changed, 13 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ee28ec9e0aba..ffa37adb7681 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -963,7 +963,8 @@ static inline void slave_disable_netpoll(struct slave *slave)
 		return;
 
 	slave->np = NULL;
-	__netpoll_free_async(np);
+
+	__netpoll_free(np);
 }
 
 static void bond_poll_controller(struct net_device *bond_dev)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cfda146f3b3b..fc8d5f1ee1ad 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1077,7 +1077,7 @@ static void macvlan_dev_netpoll_cleanup(struct net_device *dev)
 
 	vlan->netpoll = NULL;
 
-	__netpoll_free_async(netpoll);
+	__netpoll_free(netpoll);
 }
 #endif	/* CONFIG_NET_POLL_CONTROLLER */
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index d887016e54b6..db633ae9f784 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1104,10 +1104,7 @@ static void team_port_disable_netpoll(struct team_port *port)
 		return;
 	port->np = NULL;
 
-	/* Wait for transmitting packets to finish before freeing. */
-	synchronize_rcu_bh();
-	__netpoll_cleanup(np);
-	kfree(np);
+	__netpoll_free(np);
 }
 #else
 static int team_port_enable_netpoll(struct team_port *port)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 3ef82d3a78db..676f1ff161a9 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -31,8 +31,6 @@ struct netpoll {
 	bool ipv6;
 	u16 local_port, remote_port;
 	u8 remote_mac[ETH_ALEN];
-
-	struct work_struct cleanup_work;
 };
 
 struct netpoll_info {
@@ -63,7 +61,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
 int netpoll_setup(struct netpoll *np);
 void __netpoll_cleanup(struct netpoll *np);
-void __netpoll_free_async(struct netpoll *np);
+void __netpoll_free(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
 void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 			     struct net_device *dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 546af0e73ac3..ff720f1ebf73 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev)
 		return;
 
 	vlan->netpoll = NULL;
-
-	__netpoll_free_async(netpoll);
+	__netpoll_free(netpoll);
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e053a4e43758..c6abf927f0c9 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p)
 
 	p->np = NULL;
 
-	__netpoll_free_async(np);
+	__netpoll_free(np);
 }
 
 #endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 3ae899805f8b..5da9552b186b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -57,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
 	 MAX_UDP_CHUNK)
 
 static void zap_completion_queue(void);
-static void netpoll_async_cleanup(struct work_struct *work);
 
 static unsigned int carrier_timeout = 4;
 module_param(carrier_timeout, uint, 0644);
@@ -589,7 +588,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 
 	np->dev = ndev;
 	strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
-	INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
 
 	if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
 		np_err(np, "%s doesn't support polling, aborting\n",
@@ -788,10 +786,6 @@ void __netpoll_cleanup(struct netpoll *np)
 {
 	struct netpoll_info *npinfo;
 
-	/* rtnl_dereference would be preferable here but
-	 * rcu_cleanup_netpoll path can put us in here safely without
-	 * holding the rtnl, so plain rcu_dereference it is
-	 */
 	npinfo = rtnl_dereference(np->dev->npinfo);
 	if (!npinfo)
 		return;
@@ -812,21 +806,16 @@ void __netpoll_cleanup(struct netpoll *np)
 }
 EXPORT_SYMBOL_GPL(__netpoll_cleanup);
 
-static void netpoll_async_cleanup(struct work_struct *work)
+void __netpoll_free(struct netpoll *np)
 {
-	struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
+	ASSERT_RTNL();
 
-	rtnl_lock();
+	/* Wait for transmitting packets to finish before freeing. */
+	synchronize_rcu_bh();
 	__netpoll_cleanup(np);
-	rtnl_unlock();
 	kfree(np);
 }
-
-void __netpoll_free_async(struct netpoll *np)
-{
-	schedule_work(&np->cleanup_work);
-}
-EXPORT_SYMBOL_GPL(__netpoll_free_async);
+EXPORT_SYMBOL_GPL(__netpoll_free);
 
 void netpoll_cleanup(struct netpoll *np)
 {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5428ef529019..7d0c19e7edcf 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -722,7 +722,7 @@ static void dsa_slave_netpoll_cleanup(struct net_device *dev)
 
 	p->netpoll = NULL;
 
-	__netpoll_free_async(netpoll);
+	__netpoll_free(netpoll);
 }
 
 static void dsa_slave_poll_controller(struct net_device *dev)
-- 
cgit v1.2.3


From bff5b4b3737219195ca0caef4ff7884303cb5dc1 Mon Sep 17 00:00:00 2001
From: Yuiko Oshino <yuiko.oshino@microchip.com>
Date: Thu, 18 Oct 2018 15:06:01 -0400
Subject: net: phy: micrel: add Microchip KSZ9131 initial driver

Add support for Microchip Technology KSZ9131 10/100/1000 Ethernet PHY

Signed-off-by: Yuiko Oshino <yuiko.oshino@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/micrel.c   | 130 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/micrel_phy.h |   1 +
 2 files changed, 130 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 3db06b40580d..9265dea79412 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -14,7 +14,7 @@
  * option) any later version.
  *
  * Support : Micrel Phys:
- *		Giga phys: ksz9021, ksz9031
+ *		Giga phys: ksz9021, ksz9031, ksz9131
  *		100/10 Phys : ksz8001, ksz8721, ksz8737, ksz8041
  *			   ksz8021, ksz8031, ksz8051,
  *			   ksz8081, ksz8091,
@@ -609,6 +609,116 @@ err_force_master:
 	return result;
 }
 
+#define KSZ9131_SKEW_5BIT_MAX	2400
+#define KSZ9131_SKEW_4BIT_MAX	800
+#define KSZ9131_OFFSET		700
+#define KSZ9131_STEP		100
+
+static int ksz9131_of_load_skew_values(struct phy_device *phydev,
+				       struct device_node *of_node,
+				       u16 reg, size_t field_sz,
+				       char *field[], u8 numfields)
+{
+	int val[4] = {-(1 + KSZ9131_OFFSET), -(2 + KSZ9131_OFFSET),
+		      -(3 + KSZ9131_OFFSET), -(4 + KSZ9131_OFFSET)};
+	int skewval, skewmax = 0;
+	int matches = 0;
+	u16 maxval;
+	u16 newval;
+	u16 mask;
+	int i;
+
+	/* psec properties in dts should mean x pico seconds */
+	if (field_sz == 5)
+		skewmax = KSZ9131_SKEW_5BIT_MAX;
+	else
+		skewmax = KSZ9131_SKEW_4BIT_MAX;
+
+	for (i = 0; i < numfields; i++)
+		if (!of_property_read_s32(of_node, field[i], &skewval)) {
+			if (skewval < -KSZ9131_OFFSET)
+				skewval = -KSZ9131_OFFSET;
+			else if (skewval > skewmax)
+				skewval = skewmax;
+
+			val[i] = skewval + KSZ9131_OFFSET;
+			matches++;
+		}
+
+	if (!matches)
+		return 0;
+
+	if (matches < numfields)
+		newval = ksz9031_extended_read(phydev, OP_DATA, 2, reg);
+	else
+		newval = 0;
+
+	maxval = (field_sz == 4) ? 0xf : 0x1f;
+	for (i = 0; i < numfields; i++)
+		if (val[i] != -(i + 1 + KSZ9131_OFFSET)) {
+			mask = 0xffff;
+			mask ^= maxval << (field_sz * i);
+			newval = (newval & mask) |
+				(((val[i] / KSZ9131_STEP) & maxval)
+					<< (field_sz * i));
+		}
+
+	return ksz9031_extended_write(phydev, OP_DATA, 2, reg, newval);
+}
+
+static int ksz9131_config_init(struct phy_device *phydev)
+{
+	const struct device *dev = &phydev->mdio.dev;
+	struct device_node *of_node = dev->of_node;
+	char *clk_skews[2] = {"rxc-skew-psec", "txc-skew-psec"};
+	char *rx_data_skews[4] = {
+		"rxd0-skew-psec", "rxd1-skew-psec",
+		"rxd2-skew-psec", "rxd3-skew-psec"
+	};
+	char *tx_data_skews[4] = {
+		"txd0-skew-psec", "txd1-skew-psec",
+		"txd2-skew-psec", "txd3-skew-psec"
+	};
+	char *control_skews[2] = {"txen-skew-psec", "rxdv-skew-psec"};
+	const struct device *dev_walker;
+	int ret;
+
+	dev_walker = &phydev->mdio.dev;
+	do {
+		of_node = dev_walker->of_node;
+		dev_walker = dev_walker->parent;
+	} while (!of_node && dev_walker);
+
+	if (!of_node)
+		return 0;
+
+	ret = ksz9131_of_load_skew_values(phydev, of_node,
+					  MII_KSZ9031RN_CLK_PAD_SKEW, 5,
+					  clk_skews, 2);
+	if (ret < 0)
+		return ret;
+
+	ret = ksz9131_of_load_skew_values(phydev, of_node,
+					  MII_KSZ9031RN_CONTROL_PAD_SKEW, 4,
+					  control_skews, 2);
+	if (ret < 0)
+		return ret;
+
+	ret = ksz9131_of_load_skew_values(phydev, of_node,
+					  MII_KSZ9031RN_RX_DATA_PAD_SKEW, 4,
+					  rx_data_skews, 4);
+	if (ret < 0)
+		return ret;
+
+	ret = ksz9131_of_load_skew_values(phydev, of_node,
+					  MII_KSZ9031RN_TX_DATA_PAD_SKEW, 4,
+					  tx_data_skews, 4);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 #define KSZ8873MLL_GLOBAL_CONTROL_4	0x06
 #define KSZ8873MLL_GLOBAL_CONTROL_4_DUPLEX	BIT(6)
 #define KSZ8873MLL_GLOBAL_CONTROL_4_SPEED	BIT(4)
@@ -974,6 +1084,23 @@ static struct phy_driver ksphy_driver[] = {
 	.get_stats	= kszphy_get_stats,
 	.suspend	= genphy_suspend,
 	.resume		= kszphy_resume,
+}, {
+	.phy_id		= PHY_ID_KSZ9131,
+	.phy_id_mask	= MICREL_PHY_ID_MASK,
+	.name		= "Microchip KSZ9131 Gigabit PHY",
+	.features	= PHY_GBIT_FEATURES,
+	.flags		= PHY_HAS_INTERRUPT,
+	.driver_data	= &ksz9021_type,
+	.probe		= kszphy_probe,
+	.config_init	= ksz9131_config_init,
+	.read_status	= ksz9031_read_status,
+	.ack_interrupt	= kszphy_ack_interrupt,
+	.config_intr	= kszphy_config_intr,
+	.get_sset_count = kszphy_get_sset_count,
+	.get_strings	= kszphy_get_strings,
+	.get_stats	= kszphy_get_stats,
+	.suspend	= genphy_suspend,
+	.resume		= kszphy_resume,
 }, {
 	.phy_id		= PHY_ID_KSZ8873MLL,
 	.phy_id_mask	= MICREL_PHY_ID_MASK,
@@ -1022,6 +1149,7 @@ MODULE_LICENSE("GPL");
 static struct mdio_device_id __maybe_unused micrel_tbl[] = {
 	{ PHY_ID_KSZ9021, 0x000ffffe },
 	{ PHY_ID_KSZ9031, MICREL_PHY_ID_MASK },
+	{ PHY_ID_KSZ9131, MICREL_PHY_ID_MASK },
 	{ PHY_ID_KSZ8001, 0x00fffffc },
 	{ PHY_ID_KS8737, MICREL_PHY_ID_MASK },
 	{ PHY_ID_KSZ8021, 0x00ffffff },
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 472fa4d4ea62..7361cd3fddc1 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -31,6 +31,7 @@
 #define PHY_ID_KSZ8081		0x00221560
 #define PHY_ID_KSZ8061		0x00221570
 #define PHY_ID_KSZ9031		0x00221620
+#define PHY_ID_KSZ9131		0x00221640
 
 #define PHY_ID_KSZ886X		0x00221430
 #define PHY_ID_KSZ8863		0x00221435
-- 
cgit v1.2.3


From 6fff607e2f14bd7c63c06c464a6f93b8efbabe28 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 19 Oct 2018 19:56:49 -0700
Subject: bpf: sk_msg program helper bpf_msg_push_data

This allows user to push data into a msg using sk_msg program types.
The format is as follows,

	bpf_msg_push_data(msg, offset, len, flags)

this will insert 'len' bytes at offset 'offset'. For example to
prepend 10 bytes at the front of the message the user can,

	bpf_msg_push_data(msg, 0, 10, 0);

This will invalidate data bounds so BPF user will have to then recheck
data bounds after calling this. After this the msg size will have been
updated and the user is free to write into the added bytes. We allow
any offset/len as long as it is within the (data, data_end) range.
However, a copy will be required if the ring is full and its possible
for the helper to fail with ENOMEM or EINVAL errors which need to be
handled by the BPF program.

This can be used similar to XDP metadata to pass data between sk_msg
layer and lower layers.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h    |   5 ++
 include/uapi/linux/bpf.h |  20 ++++++-
 net/core/filter.c        | 134 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 84e18863f6a4..2a11e9d91dfa 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -207,6 +207,11 @@ static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
 	return &msg->sg.data[which];
 }
 
+static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which)
+{
+	return msg->sg.data[which];
+}
+
 static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
 {
 	return sg_page(sk_msg_elem(msg, which));
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a2fb333290dc..852dc17ab47a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2240,6 +2240,23 @@ union bpf_attr {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into msg at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg* it may want to insert metadata or options into the msg.
+ *		This can later be read and used by any of the lower layer BPF
+ *		hooks.
+ *
+ *		This helper may fail if under memory pressure (a malloc
+ *		fails) in these cases BPF programs will get an appropriate
+ *		error and BPF programs will need to handle them.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2331,7 +2348,8 @@ union bpf_attr {
 	FN(sk_release),			\
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
-	FN(map_peek_elem),
+	FN(map_peek_elem),		\
+	FN(msg_push_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 5fd5139e8638..35c6933c2622 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2297,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+	u32 new, i = 0, l, space, copy = 0, offset = 0;
+	u8 *raw, *to, *from;
+	struct page *page;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	if (start >= offset + l)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	/* If no space available will fallback to copy, we need at
+	 * least one scatterlist elem available to push data into
+	 * when start aligns to the beginning of an element or two
+	 * when it falls inside an element. We handle the start equals
+	 * offset case because its the common case for inserting a
+	 * header.
+	 */
+	if (!space || (space == 1 && start != offset))
+		copy = msg->sg.data[i].length;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+			   get_order(copy + len));
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (copy) {
+		int front, back;
+
+		raw = page_address(page);
+
+		psge = sk_msg_elem(msg, i);
+		front = start - offset;
+		back = psge->length - front;
+		from = sg_virt(psge);
+
+		if (front)
+			memcpy(raw, from, front);
+
+		if (back) {
+			from += front;
+			to = raw + front + len;
+
+			memcpy(to, from, back);
+		}
+
+		put_page(sg_page(psge));
+	} else if (start - offset) {
+		psge = sk_msg_elem(msg, i);
+		rsge = sk_msg_elem_cpy(msg, i);
+
+		psge->length = start - offset;
+		rsge.length -= psge->length;
+		rsge.offset += start;
+
+		sk_msg_iter_var_next(i);
+		sg_unmark_end(psge);
+		sk_msg_iter_next(msg, end);
+	}
+
+	/* Slot(s) to place newly allocated data */
+	new = i;
+
+	/* Shift one or two slots as needed */
+	if (!copy) {
+		sge = sk_msg_elem_cpy(msg, i);
+
+		sk_msg_iter_var_next(i);
+		sg_unmark_end(&sge);
+		sk_msg_iter_next(msg, end);
+
+		nsge = sk_msg_elem_cpy(msg, i);
+		if (rsge.length) {
+			sk_msg_iter_var_next(i);
+			nnsge = sk_msg_elem_cpy(msg, i);
+		}
+
+		while (i != msg->sg.end) {
+			msg->sg.data[i] = sge;
+			sge = nsge;
+			sk_msg_iter_var_next(i);
+			if (rsge.length) {
+				nsge = nnsge;
+				nnsge = sk_msg_elem_cpy(msg, i);
+			} else {
+				nsge = sk_msg_elem_cpy(msg, i);
+			}
+		}
+	}
+
+	/* Place newly allocated data buffer */
+	sk_mem_charge(msg->sk, len);
+	msg->sg.size += len;
+	msg->sg.copy[new] = false;
+	sg_set_page(&msg->sg.data[new], page, len + copy, 0);
+	if (rsge.length) {
+		get_page(sg_page(&rsge));
+		sk_msg_iter_var_next(new);
+		msg->sg.data[new] = rsge;
+	}
+
+	sk_msg_compute_data_pointers(msg);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_push_data_proto = {
+	.func		= bpf_msg_push_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -4854,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_xdp_adjust_head ||
 	    func == bpf_xdp_adjust_meta ||
 	    func == bpf_msg_pull_data ||
+	    func == bpf_msg_push_data ||
 	    func == bpf_xdp_adjust_tail ||
 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
@@ -5130,6 +5262,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_cork_bytes_proto;
 	case BPF_FUNC_msg_pull_data:
 		return &bpf_msg_pull_data_proto;
+	case BPF_FUNC_msg_push_data:
+		return &bpf_msg_push_data_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
 	default:
-- 
cgit v1.2.3


From c16ee04c9b305d57719344922c4d48379e206a79 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 21 Oct 2018 02:09:23 +0200
Subject: ulp: remove uid and user_visible members

They are not used anymore and therefore should be removed.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h  | 7 -------
 net/tls/tls_main.c | 2 --
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 14fdd7ce9992..8a61c3e8c15d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2051,11 +2051,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 #define TCP_ULP_MAX		128
 #define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
 
-enum {
-	TCP_ULP_TLS,
-	TCP_ULP_BPF,
-};
-
 struct tcp_ulp_ops {
 	struct list_head	list;
 
@@ -2064,9 +2059,7 @@ struct tcp_ulp_ops {
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 
-	int		uid;
 	char		name[TCP_ULP_NAME_MAX];
-	bool		user_visible;
 	struct module	*owner;
 };
 int tcp_register_ulp(struct tcp_ulp_ops *type);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index e90b6d537077..311cec8e533d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -715,8 +715,6 @@ EXPORT_SYMBOL(tls_unregister_device);
 
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
-	.uid			= TCP_ULP_TLS,
-	.user_visible		= true,
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
 };
-- 
cgit v1.2.3


From 81fa7a69c2174ed8de314b9c231ef30a8718e5e1 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 19 Oct 2018 11:19:13 +0200
Subject: dt-bindings: phy: Update SERDES_MAX to be SERDES_MAX + 1

SERDES_MAX is a valid value to index ctrl->phys in
drivers/phy/mscc/phy-ocelot-serdes.c. But, currently,
there is an out-of-bounds bug in the mentioned driver
when reading from ctrl->phys, because the size of
array ctrl->phys is SERDES_MAX.

Partially fix this by updating SERDES_MAX to be SERDES6G_MAX + 1.

Notice that this is the first part of the solution to
the out-of-bounds bug mentioned above. Although this
change is not dependent on any other one.

Suggested-by: Quentin Schulz <quentin.schulz@bootlin.com>
Reviewed-by: Quentin Schulz <quentin.schulz@bootlin.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/phy/phy-ocelot-serdes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/dt-bindings/phy/phy-ocelot-serdes.h b/include/dt-bindings/phy/phy-ocelot-serdes.h
index bd28f21206f6..fe70adaca68f 100644
--- a/include/dt-bindings/phy/phy-ocelot-serdes.h
+++ b/include/dt-bindings/phy/phy-ocelot-serdes.h
@@ -7,6 +7,6 @@
 #define SERDES1G_MAX	SERDES1G(5)
 #define SERDES6G(x)	(SERDES1G_MAX + 1 + (x))
 #define SERDES6G_MAX	SERDES6G(2)
-#define SERDES_MAX	SERDES6G_MAX
+#define SERDES_MAX	(SERDES6G_MAX + 1)
 
 #endif
-- 
cgit v1.2.3


From 876dcf2f3aaa0f68d437b368b93a4c4b81521191 Mon Sep 17 00:00:00 2001
From: Olivier Brunel <jjk@jjacky.com>
Date: Sat, 20 Oct 2018 19:39:56 +0200
Subject: umh: Add command line to user mode helpers

User mode helpers were spawned without a command line, and because
an empty command line is used by many tools to identify processes as
kernel threads, this could cause some issues.

Notably during killing spree on shutdown, since such helper would then
be skipped (i.e. not killed) which would result in the process remaining
alive, and thus preventing unmouting of the rootfs (as experienced with
the bpfilter umh).

Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper")
Signed-off-by: Olivier Brunel <jjk@jjacky.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/umh.h |  1 +
 kernel/umh.c        | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 5c812acbb80a..235f51b62c71 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -44,6 +44,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 struct umh_info {
+	const char *cmdline;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	pid_t pid;
diff --git a/kernel/umh.c b/kernel/umh.c
index c449858946af..0baa672e023c 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -405,11 +405,19 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 		void (*cleanup)(struct subprocess_info *info), void *data)
 {
 	struct subprocess_info *sub_info;
+	struct umh_info *info = data;
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 
 	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
 	if (!sub_info)
 		return NULL;
 
+	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!sub_info->argv) {
+		kfree(sub_info);
+		return NULL;
+	}
+
 	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
 	sub_info->path = "none";
 	sub_info->file = file;
@@ -458,10 +466,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	return 0;
 }
 
-static void umh_save_pid(struct subprocess_info *info)
+static void umh_clean_and_save_pid(struct subprocess_info *info)
 {
 	struct umh_info *umh_info = info->data;
 
+	argv_free(info->argv);
 	umh_info->pid = info->pid;
 }
 
@@ -471,6 +480,9 @@ static void umh_save_pid(struct subprocess_info *info)
  * @len: length of the blob
  * @info: information about usermode process (shouldn't be NULL)
  *
+ * If info->cmdline is set it will be used as command line for the
+ * user process, else "usermodehelper" is used.
+ *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
  * case 'struct umh_info *info' is populated with two pipes
@@ -500,7 +512,7 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 
 	err = -ENOMEM;
 	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
-						  umh_save_pid, info);
+						  umh_clean_and_save_pid, info);
 	if (!sub_info)
 		goto out;
 
-- 
cgit v1.2.3


From 424c22fb62427362f1f660cd83fbdc41aec488b6 Mon Sep 17 00:00:00 2001
From: Vito Caputo <vcaputo@pengaru.com>
Date: Sun, 21 Oct 2018 04:33:03 -0700
Subject: af_unix.h: trivial whitespace cleanup

Replace spurious spaces with a tab and remove superfluous tab from
unix_sock struct.

Signed-off-by: Vito Caputo <vcaputo@pengaru.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index a5ba41b3b867..e2695c4bf358 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -52,7 +52,7 @@ struct unix_skb_parms {
 struct unix_sock {
 	/* WARNING: sk has to be the first member */
 	struct sock		sk;
-	struct unix_address     *addr;
+	struct unix_address	*addr;
 	struct path		path;
 	struct mutex		iolock, bindlock;
 	struct sock		*peer;
@@ -63,7 +63,7 @@ struct unix_sock {
 #define UNIX_GC_CANDIDATE	0
 #define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
-	wait_queue_entry_t		peer_wake;
+	wait_queue_entry_t	peer_wake;
 };
 
 static inline struct unix_sock *unix_sk(const struct sock *sk)
-- 
cgit v1.2.3


From ba59d5705825fb9cab3ff092552802f4fefc3635 Mon Sep 17 00:00:00 2001
From: Mathias Thore <mathias.thore@infinera.com>
Date: Mon, 22 Oct 2018 14:55:50 +0200
Subject: net/wan/fsl_ucc_hdlc: error counters

Extract error information from rx and tx buffer descriptors,
and update error counters.

Signed-off-by: Mathias Thore <mathias.thore@infinera.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 51 ++++++++++++++++++++++++++++++++++++------
 include/soc/fsl/qe/ucc_fast.h  |  8 ++++++-
 2 files changed, 51 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 8523ade16030..4d6409605207 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -36,6 +36,7 @@
 #define DRV_NAME "ucc_hdlc"
 
 #define TDM_PPPOHT_SLIC_MAXIN
+#define RX_BD_ERRORS (R_CD_S | R_OV_S | R_CR_S | R_AB_S | R_NO_S | R_LG_S)
 
 static struct ucc_tdm_info utdm_primary_info = {
 	.uf_info = {
@@ -430,12 +431,25 @@ static netdev_tx_t ucc_hdlc_tx(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
+static int hdlc_tx_restart(struct ucc_hdlc_private *priv)
+{
+	u32 cecr_subblock;
+
+	cecr_subblock =
+		ucc_fast_get_qe_cr_subblock(priv->ut_info->uf_info.ucc_num);
+
+	qe_issue_cmd(QE_RESTART_TX, cecr_subblock,
+		     QE_CR_PROTOCOL_UNSPECIFIED, 0);
+	return 0;
+}
+
 static int hdlc_tx_done(struct ucc_hdlc_private *priv)
 {
 	/* Start from the next BD that should be filled */
 	struct net_device *dev = priv->ndev;
 	struct qe_bd *bd;		/* BD pointer */
 	u16 bd_status;
+	int tx_restart = 0;
 
 	bd = priv->dirty_tx;
 	bd_status = ioread16be(&bd->status);
@@ -444,6 +458,15 @@ static int hdlc_tx_done(struct ucc_hdlc_private *priv)
 	while ((bd_status & T_R_S) == 0) {
 		struct sk_buff *skb;
 
+		if (bd_status & T_UN_S) { /* Underrun */
+			dev->stats.tx_fifo_errors++;
+			tx_restart = 1;
+		}
+		if (bd_status & T_CT_S) { /* Carrier lost */
+			dev->stats.tx_carrier_errors++;
+			tx_restart = 1;
+		}
+
 		/* BD contains already transmitted buffer.   */
 		/* Handle the transmitted buffer and release */
 		/* the BD to be used with the current frame  */
@@ -475,6 +498,9 @@ static int hdlc_tx_done(struct ucc_hdlc_private *priv)
 	}
 	priv->dirty_tx = bd;
 
+	if (tx_restart)
+		hdlc_tx_restart(priv);
+
 	return 0;
 }
 
@@ -493,11 +519,22 @@ static int hdlc_rx_done(struct ucc_hdlc_private *priv, int rx_work_limit)
 
 	/* while there are received buffers and BD is full (~R_E) */
 	while (!((bd_status & (R_E_S)) || (--rx_work_limit < 0))) {
-		if (bd_status & R_OV_S)
-			dev->stats.rx_over_errors++;
-		if (bd_status & R_CR_S) {
-			dev->stats.rx_crc_errors++;
-			dev->stats.rx_dropped++;
+		if (bd_status & (RX_BD_ERRORS)) {
+			dev->stats.rx_errors++;
+
+			if (bd_status & R_CD_S)
+				dev->stats.collisions++;
+			if (bd_status & R_OV_S)
+				dev->stats.rx_fifo_errors++;
+			if (bd_status & R_CR_S)
+				dev->stats.rx_crc_errors++;
+			if (bd_status & R_AB_S)
+				dev->stats.rx_over_errors++;
+			if (bd_status & R_NO_S)
+				dev->stats.rx_frame_errors++;
+			if (bd_status & R_LG_S)
+				dev->stats.rx_length_errors++;
+
 			goto recycle;
 		}
 		bdbuffer = priv->rx_buffer +
@@ -546,7 +583,7 @@ static int hdlc_rx_done(struct ucc_hdlc_private *priv, int rx_work_limit)
 		netif_receive_skb(skb);
 
 recycle:
-		iowrite16be(bd_status | R_E_S | R_I_S, &bd->status);
+		iowrite16be((bd_status & R_W_S) | R_E_S | R_I_S, &bd->status);
 
 		/* update to point at the next bd */
 		if (bd_status & R_W_S) {
@@ -622,7 +659,7 @@ static irqreturn_t ucc_hdlc_irq_handler(int irq, void *dev_id)
 
 	/* Errors and other events */
 	if (ucce >> 16 & UCC_HDLC_UCCE_BSY)
-		dev->stats.rx_errors++;
+		dev->stats.rx_missed_errors++;
 	if (ucce >> 16 & UCC_HDLC_UCCE_TXE)
 		dev->stats.tx_errors++;
 
diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h
index 3ee9e7c1a7d7..dcd6b865b590 100644
--- a/include/soc/fsl/qe/ucc_fast.h
+++ b/include/soc/fsl/qe/ucc_fast.h
@@ -41,8 +41,12 @@
 #define R_L_S	0x0800	/* last */
 #define R_F_S	0x0400	/* first */
 #define R_CM_S	0x0200	/* continuous mode */
+#define R_LG_S  0x0020  /* frame length */
+#define R_NO_S  0x0010  /* nonoctet */
+#define R_AB_S  0x0008  /* abort */
 #define R_CR_S	0x0004	/* crc */
-#define R_OV_S	0x0002	/* crc */
+#define R_OV_S	0x0002	/* overrun */
+#define R_CD_S  0x0001  /* carrier detect */
 
 /* transmit BD's status */
 #define T_R_S	0x8000	/* ready bit */
@@ -51,6 +55,8 @@
 #define T_L_S	0x0800	/* last */
 #define T_TC_S	0x0400	/* crc */
 #define T_TM_S	0x0200	/* continuous mode */
+#define T_UN_S  0x0002  /* hdlc underrun */
+#define T_CT_S  0x0001  /* hdlc carrier lost */
 
 /* Rx Data buffer must be 4 bytes aligned in most cases */
 #define UCC_FAST_RX_ALIGN			4
-- 
cgit v1.2.3


From 89ab066d4229acd32e323f1569833302544a4186 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 23 Oct 2018 13:40:39 +0200
Subject: Revert "net: simplify sock_poll_wait"

This reverts commit dd979b4df817e9976f18fb6f9d134d6bc4a3c317.

This broke tcp_poll for SMC fallback: An AF_SMC socket establishes an
internal TCP socket for the initial handshake with the remote peer.
Whenever the SMC connection can not be established this TCP socket is
used as a fallback. All socket operations on the SMC socket are then
forwarded to the TCP socket. In case of poll, the file->private_data
pointer references the SMC socket because the TCP socket has no file
assigned. This causes tcp_poll to wait on the wrong socket.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/af_alg.c        |  2 +-
 include/net/sock.h     | 12 +++++++++---
 net/atm/common.c       |  2 +-
 net/caif/caif_socket.c |  2 +-
 net/core/datagram.c    |  2 +-
 net/dccp/proto.c       |  2 +-
 net/ipv4/tcp.c         |  2 +-
 net/iucv/af_iucv.c     |  2 +-
 net/nfc/llcp_sock.c    |  2 +-
 net/rxrpc/af_rxrpc.c   |  2 +-
 net/smc/af_smc.c       |  2 +-
 net/tipc/socket.c      |  2 +-
 net/unix/af_unix.c     |  4 ++--
 13 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index b053179e0bc5..17eb09d222ff 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1071,7 +1071,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock,
 	struct af_alg_ctx *ctx = ask->private;
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	if (!ctx->more || ctx->used)
diff --git a/include/net/sock.h b/include/net/sock.h
index 2440f8b407eb..f665d74ae509 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2059,14 +2059,20 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
 /**
  * sock_poll_wait - place memory barrier behind the poll_wait call.
  * @filp:           file
+ * @sock:           socket to wait on
  * @p:              poll_table
  *
  * See the comments in the wq_has_sleeper function.
+ *
+ * Do not derive sock from filp->private_data here. An SMC socket establishes
+ * an internal TCP socket that is used in the fallback case. All socket
+ * operations on the SMC socket are then forwarded to the TCP socket. In case of
+ * poll, the filp->private_data pointer references the SMC socket because the
+ * TCP socket has no file assigned.
  */
-static inline void sock_poll_wait(struct file *filp, poll_table *p)
+static inline void sock_poll_wait(struct file *filp, struct socket *sock,
+				  poll_table *p)
 {
-	struct socket *sock = filp->private_data;
-
 	if (!poll_does_not_wait(p)) {
 		poll_wait(filp, &sock->wq->wait, p);
 		/* We need to be sure we are in sync with the
diff --git a/net/atm/common.c b/net/atm/common.c
index 9f8cb0d2e71e..a38c174fc766 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct atm_vcc *vcc;
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	vcc = ATM_SD(sock);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index d18965f3291f..416717c57cd1 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file,
 	__poll_t mask;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9aac0d63d53e..6a034eb538a1 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	/* exceptional events? */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 875858c8b059..43733accf58e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
 	__poll_t mask;
 	struct sock *sk = sock->sk;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	if (sk->sk_state == DCCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8ba8fa34eff..1834818ed07b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int state;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 
 	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 45115c125569..0bed4cc20603 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1504,7 +1504,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 
 	if (sk->sk_state == IUCV_LISTEN)
 		return iucv_accept_poll(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index dd4adf8b1167..ae296273ce3d 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
 
 	pr_debug("%p\n", sk);
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 
 	if (sk->sk_state == LLCP_LISTEN)
 		return llcp_accept_poll(sk);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 013dbcb052e5..64362d078da8 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -756,7 +756,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
 	struct rxrpc_sock *rx = rxrpc_sk(sk);
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	/* the socket is readable if there are any messages waiting on the Rx
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 015231789ed2..80e2119f1c70 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1543,7 +1543,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			mask |= EPOLLERR;
 	} else {
 		if (sk->sk_state != SMC_CLOSED)
-			sock_poll_wait(file, wait);
+			sock_poll_wait(file, sock, wait);
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index de09f514428c..636e6131769d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -717,7 +717,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
 	struct tipc_sock *tsk = tipc_sk(sk);
 	__poll_t revents = 0;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 98d34fb61744..74d1eed7cbd4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2642,7 +2642,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 	struct sock *sk = sock->sk;
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	/* exceptional events? */
@@ -2679,7 +2679,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	unsigned int writable;
 	__poll_t mask;
 
-	sock_poll_wait(file, wait);
+	sock_poll_wait(file, sock, wait);
 	mask = 0;
 
 	/* exceptional events? */
-- 
cgit v1.2.3


From 3f80e08f40cdb308589a49077c87632fa4508b21 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 23 Oct 2018 11:54:16 -0700
Subject: tcp: add tcp_reset_xmit_timer() helper

With EDT model, SRTT no longer is inflated by pacing delays.

This means that RTO and some other xmit timers might be setup
incorrectly. This is particularly visible with either :

- Very small enforced pacing rates (SO_MAX_PACING_RATE)
- Reduced rto (from the default 200 ms)

This can lead to TCP flows aborts in the worst case,
or spurious retransmits in other cases.

For example, this session gets far more throughput
than the requested 80kbit :

$ netperf -H 127.0.0.2 -l 100 -- -q 10000
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.2 () port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

540000 262144 262144    104.00      2.66

With the fix :

$ netperf -H 127.0.0.2 -l 100 -- -q 10000
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.2 () port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

540000 262144 262144    104.00      0.12

EDT allows for better control of rtx timers, since TCP has
a better idea of the earliest departure time of each skb
in the rtx queue. We only have to eventually add to the
timer the difference of the EDT time with current time.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 30 +++++++++++++++++++++++++++---
 net/ipv4/tcp_input.c  |  8 ++++----
 net/ipv4/tcp_output.c | 18 ++++++++++--------
 3 files changed, 41 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8a61c3e8c15d..a18914d20486 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1245,8 +1245,31 @@ static inline bool tcp_needs_internal_pacing(const struct sock *sk)
 	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
 }
 
+/* Return in jiffies the delay before one skb is sent.
+ * If @skb is NULL, we look at EDT for next packet being sent on the socket.
+ */
+static inline unsigned long tcp_pacing_delay(const struct sock *sk,
+					     const struct sk_buff *skb)
+{
+	s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns;
+
+	pacing_delay -= tcp_sk(sk)->tcp_clock_cache;
+
+	return pacing_delay > 0 ? nsecs_to_jiffies(pacing_delay) : 0;
+}
+
+static inline void tcp_reset_xmit_timer(struct sock *sk,
+					const int what,
+					unsigned long when,
+					const unsigned long max_when,
+					const struct sk_buff *skb)
+{
+	inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk, skb),
+				  max_when);
+}
+
 /* Something is really bad, we could not queue an additional packet,
- * because qdisc is full or receiver sent a 0 window.
+ * because qdisc is full or receiver sent a 0 window, or we are paced.
  * We do not want to add fuel to the fire, or abort too early,
  * so make sure the timer we arm now is at least 200ms in the future,
  * regardless of current icsk_rto value (as it could be ~2ms)
@@ -1268,8 +1291,9 @@ static inline unsigned long tcp_probe0_when(const struct sock *sk,
 static inline void tcp_check_probe_timer(struct sock *sk)
 {
 	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  tcp_probe0_base(sk), TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+				     tcp_probe0_base(sk), TCP_RTO_MAX,
+				     NULL);
 }
 
 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 188980c58f87..2868ef28ce52 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2979,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk)
 			 */
 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
-					  TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+				     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
 	}
 }
 
@@ -3255,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk)
 	} else {
 		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
 
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  when, TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+				     when, TCP_RTO_MAX, NULL);
 	}
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c07990a35ff3..9c34b97d365d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2455,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 	if (rto_delta_us > 0)
 		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+			     TCP_RTO_MAX, NULL);
 	return true;
 }
 
@@ -3020,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 		if (skb == rtx_head &&
 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  inet_csk(sk)->icsk_rto,
-						  TCP_RTO_MAX);
+			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					     inet_csk(sk)->icsk_rto,
+					     TCP_RTO_MAX,
+					     skb);
 	}
 }
 
@@ -3752,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
 			icsk->icsk_probes_out = 1;
 		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
 	}
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-				  tcp_probe0_when(sk, probe_max),
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+			     tcp_probe0_when(sk, probe_max),
+			     TCP_RTO_MAX,
+			     NULL);
 }
 
 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
-- 
cgit v1.2.3